github.com/apache/arrow/go/v14@v14.0.2/parquet/internal/encoding/encoding_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package encoding_test 18 19 import ( 20 "bufio" 21 "fmt" 22 "os" 23 "path" 24 "reflect" 25 "strconv" 26 "testing" 27 "unsafe" 28 29 "github.com/apache/arrow/go/v14/arrow" 30 "github.com/apache/arrow/go/v14/arrow/bitutil" 31 "github.com/apache/arrow/go/v14/arrow/memory" 32 "github.com/apache/arrow/go/v14/parquet" 33 "github.com/apache/arrow/go/v14/parquet/internal/encoding" 34 "github.com/apache/arrow/go/v14/parquet/internal/testutils" 35 "github.com/apache/arrow/go/v14/parquet/schema" 36 "github.com/stretchr/testify/assert" 37 "github.com/stretchr/testify/require" 38 "github.com/stretchr/testify/suite" 39 ) 40 41 type nodeFactory func(string, parquet.Repetition, int32) *schema.PrimitiveNode 42 43 func createNodeFactory(t reflect.Type) nodeFactory { 44 switch t { 45 case reflect.TypeOf(true): 46 return schema.NewBooleanNode 47 case reflect.TypeOf(int32(0)): 48 return schema.NewInt32Node 49 case reflect.TypeOf(int64(0)): 50 return schema.NewInt64Node 51 case reflect.TypeOf(parquet.Int96{}): 52 return schema.NewInt96Node 53 case reflect.TypeOf(float32(0)): 54 return schema.NewFloat32Node 55 case reflect.TypeOf(float64(0)): 56 return schema.NewFloat64Node 57 case reflect.TypeOf(parquet.ByteArray{}): 58 return schema.NewByteArrayNode 59 case reflect.TypeOf(parquet.FixedLenByteArray{}): 60 return func(name string, rep parquet.Repetition, field int32) *schema.PrimitiveNode { 61 return schema.NewFixedLenByteArrayNode(name, rep, 12, field) 62 } 63 } 64 return nil 65 } 66 67 func initdata(t reflect.Type, drawbuf, decodebuf []byte, nvals, repeats int, heap *memory.Buffer) (interface{}, interface{}) { 68 switch t { 69 case reflect.TypeOf(true): 70 draws := *(*[]bool)(unsafe.Pointer(&drawbuf)) 71 decode := *(*[]bool)(unsafe.Pointer(&decodebuf)) 72 testutils.InitValues(draws[:nvals], heap) 73 74 for j := 1; j < repeats; j++ { 75 for k := 0; k < nvals; k++ { 76 draws[nvals*j+k] = draws[k] 77 } 78 } 79 80 return draws[:nvals*repeats], decode[:nvals*repeats] 81 case reflect.TypeOf(int32(0)): 82 draws := arrow.Int32Traits.CastFromBytes(drawbuf) 83 decode := arrow.Int32Traits.CastFromBytes(decodebuf) 84 testutils.InitValues(draws[:nvals], heap) 85 86 for j := 1; j < repeats; j++ { 87 for k := 0; k < nvals; k++ { 88 draws[nvals*j+k] = draws[k] 89 } 90 } 91 92 return draws[:nvals*repeats], decode[:nvals*repeats] 93 case reflect.TypeOf(int64(0)): 94 draws := arrow.Int64Traits.CastFromBytes(drawbuf) 95 decode := arrow.Int64Traits.CastFromBytes(decodebuf) 96 testutils.InitValues(draws[:nvals], heap) 97 98 for j := 1; j < repeats; j++ { 99 for k := 0; k < nvals; k++ { 100 draws[nvals*j+k] = draws[k] 101 } 102 } 103 104 return draws[:nvals*repeats], decode[:nvals*repeats] 105 case reflect.TypeOf(parquet.Int96{}): 106 draws := parquet.Int96Traits.CastFromBytes(drawbuf) 107 decode := parquet.Int96Traits.CastFromBytes(decodebuf) 108 testutils.InitValues(draws[:nvals], heap) 109 110 for j := 1; j < repeats; j++ { 111 for k := 0; k < nvals; k++ { 112 draws[nvals*j+k] = draws[k] 113 } 114 } 115 116 return draws[:nvals*repeats], decode[:nvals*repeats] 117 case reflect.TypeOf(float32(0)): 118 draws := arrow.Float32Traits.CastFromBytes(drawbuf) 119 decode := arrow.Float32Traits.CastFromBytes(decodebuf) 120 testutils.InitValues(draws[:nvals], heap) 121 122 for j := 1; j < repeats; j++ { 123 for k := 0; k < nvals; k++ { 124 draws[nvals*j+k] = draws[k] 125 } 126 } 127 128 return draws[:nvals*repeats], decode[:nvals*repeats] 129 case reflect.TypeOf(float64(0)): 130 draws := arrow.Float64Traits.CastFromBytes(drawbuf) 131 decode := arrow.Float64Traits.CastFromBytes(decodebuf) 132 testutils.InitValues(draws[:nvals], heap) 133 134 for j := 1; j < repeats; j++ { 135 for k := 0; k < nvals; k++ { 136 draws[nvals*j+k] = draws[k] 137 } 138 } 139 140 return draws[:nvals*repeats], decode[:nvals*repeats] 141 case reflect.TypeOf(parquet.ByteArray{}): 142 draws := make([]parquet.ByteArray, nvals*repeats) 143 decode := make([]parquet.ByteArray, nvals*repeats) 144 testutils.InitValues(draws[:nvals], heap) 145 146 for j := 1; j < repeats; j++ { 147 for k := 0; k < nvals; k++ { 148 draws[nvals*j+k] = draws[k] 149 } 150 } 151 152 return draws[:nvals*repeats], decode[:nvals*repeats] 153 case reflect.TypeOf(parquet.FixedLenByteArray{}): 154 draws := make([]parquet.FixedLenByteArray, nvals*repeats) 155 decode := make([]parquet.FixedLenByteArray, nvals*repeats) 156 testutils.InitValues(draws[:nvals], heap) 157 158 for j := 1; j < repeats; j++ { 159 for k := 0; k < nvals; k++ { 160 draws[nvals*j+k] = draws[k] 161 } 162 } 163 164 return draws[:nvals*repeats], decode[:nvals*repeats] 165 } 166 return nil, nil 167 } 168 169 func encode(enc encoding.TypedEncoder, vals interface{}) { 170 switch v := vals.(type) { 171 case []bool: 172 enc.(encoding.BooleanEncoder).Put(v) 173 case []int32: 174 enc.(encoding.Int32Encoder).Put(v) 175 case []int64: 176 enc.(encoding.Int64Encoder).Put(v) 177 case []parquet.Int96: 178 enc.(encoding.Int96Encoder).Put(v) 179 case []float32: 180 enc.(encoding.Float32Encoder).Put(v) 181 case []float64: 182 enc.(encoding.Float64Encoder).Put(v) 183 case []parquet.ByteArray: 184 enc.(encoding.ByteArrayEncoder).Put(v) 185 case []parquet.FixedLenByteArray: 186 enc.(encoding.FixedLenByteArrayEncoder).Put(v) 187 } 188 } 189 190 func encodeSpaced(enc encoding.TypedEncoder, vals interface{}, validBits []byte, validBitsOffset int64) { 191 switch v := vals.(type) { 192 case []bool: 193 enc.(encoding.BooleanEncoder).PutSpaced(v, validBits, validBitsOffset) 194 case []int32: 195 enc.(encoding.Int32Encoder).PutSpaced(v, validBits, validBitsOffset) 196 case []int64: 197 enc.(encoding.Int64Encoder).PutSpaced(v, validBits, validBitsOffset) 198 case []parquet.Int96: 199 enc.(encoding.Int96Encoder).PutSpaced(v, validBits, validBitsOffset) 200 case []float32: 201 enc.(encoding.Float32Encoder).PutSpaced(v, validBits, validBitsOffset) 202 case []float64: 203 enc.(encoding.Float64Encoder).PutSpaced(v, validBits, validBitsOffset) 204 case []parquet.ByteArray: 205 enc.(encoding.ByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) 206 case []parquet.FixedLenByteArray: 207 enc.(encoding.FixedLenByteArrayEncoder).PutSpaced(v, validBits, validBitsOffset) 208 } 209 } 210 211 func decode(dec encoding.TypedDecoder, out interface{}) (int, error) { 212 switch v := out.(type) { 213 case []bool: 214 return dec.(encoding.BooleanDecoder).Decode(v) 215 case []int32: 216 return dec.(encoding.Int32Decoder).Decode(v) 217 case []int64: 218 return dec.(encoding.Int64Decoder).Decode(v) 219 case []parquet.Int96: 220 return dec.(encoding.Int96Decoder).Decode(v) 221 case []float32: 222 return dec.(encoding.Float32Decoder).Decode(v) 223 case []float64: 224 return dec.(encoding.Float64Decoder).Decode(v) 225 case []parquet.ByteArray: 226 return dec.(encoding.ByteArrayDecoder).Decode(v) 227 case []parquet.FixedLenByteArray: 228 return dec.(encoding.FixedLenByteArrayDecoder).Decode(v) 229 } 230 return 0, nil 231 } 232 233 func decodeSpaced(dec encoding.TypedDecoder, out interface{}, nullCount int, validBits []byte, validBitsOffset int64) (int, error) { 234 switch v := out.(type) { 235 case []bool: 236 return dec.(encoding.BooleanDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 237 case []int32: 238 return dec.(encoding.Int32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 239 case []int64: 240 return dec.(encoding.Int64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 241 case []parquet.Int96: 242 return dec.(encoding.Int96Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 243 case []float32: 244 return dec.(encoding.Float32Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 245 case []float64: 246 return dec.(encoding.Float64Decoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 247 case []parquet.ByteArray: 248 return dec.(encoding.ByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 249 case []parquet.FixedLenByteArray: 250 return dec.(encoding.FixedLenByteArrayDecoder).DecodeSpaced(v, nullCount, validBits, validBitsOffset) 251 } 252 return 0, nil 253 } 254 255 type BaseEncodingTestSuite struct { 256 suite.Suite 257 258 descr *schema.Column 259 typeLen int 260 mem memory.Allocator 261 typ reflect.Type 262 263 nvalues int 264 heap *memory.Buffer 265 inputBytes *memory.Buffer 266 outputBytes *memory.Buffer 267 nodeFactory nodeFactory 268 269 draws interface{} 270 decodeBuf interface{} 271 } 272 273 func (b *BaseEncodingTestSuite) SetupSuite() { 274 b.mem = memory.DefaultAllocator 275 b.inputBytes = memory.NewResizableBuffer(b.mem) 276 b.outputBytes = memory.NewResizableBuffer(b.mem) 277 b.heap = memory.NewResizableBuffer(b.mem) 278 b.nodeFactory = createNodeFactory(b.typ) 279 } 280 281 func (b *BaseEncodingTestSuite) TearDownSuite() { 282 b.inputBytes.Release() 283 b.outputBytes.Release() 284 b.heap.Release() 285 } 286 287 func (b *BaseEncodingTestSuite) SetupTest() { 288 b.descr = schema.NewColumn(b.nodeFactory("name", parquet.Repetitions.Optional, -1), 0, 0) 289 b.typeLen = int(b.descr.TypeLength()) 290 } 291 292 func (b *BaseEncodingTestSuite) initData(nvalues, repeats int) { 293 b.nvalues = nvalues * repeats 294 b.inputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) 295 b.outputBytes.ResizeNoShrink(b.nvalues * int(b.typ.Size())) 296 memory.Set(b.inputBytes.Buf(), 0) 297 memory.Set(b.outputBytes.Buf(), 0) 298 299 b.draws, b.decodeBuf = initdata(b.typ, b.inputBytes.Buf(), b.outputBytes.Buf(), nvalues, repeats, b.heap) 300 } 301 302 func (b *BaseEncodingTestSuite) encodeTestData(e parquet.Encoding) (encoding.Buffer, error) { 303 enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) 304 b.Equal(e, enc.Encoding()) 305 b.Equal(b.descr.PhysicalType(), enc.Type()) 306 encode(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface()) 307 return enc.FlushValues() 308 } 309 310 func (b *BaseEncodingTestSuite) decodeTestData(e parquet.Encoding, buf []byte) { 311 dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) 312 b.Equal(e, dec.Encoding()) 313 b.Equal(b.descr.PhysicalType(), dec.Type()) 314 315 dec.SetData(b.nvalues, buf) 316 decoded, _ := decode(dec, b.decodeBuf) 317 b.Equal(b.nvalues, decoded) 318 b.Equal(reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), reflect.ValueOf(b.decodeBuf).Slice(0, b.nvalues).Interface()) 319 } 320 321 func (b *BaseEncodingTestSuite) encodeTestDataSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (encoding.Buffer, error) { 322 enc := encoding.NewEncoder(testutils.TypeToParquetType(b.typ), e, false, b.descr, memory.DefaultAllocator) 323 encodeSpaced(enc, reflect.ValueOf(b.draws).Slice(0, b.nvalues).Interface(), validBits, validBitsOffset) 324 return enc.FlushValues() 325 } 326 327 func (b *BaseEncodingTestSuite) decodeTestDataSpaced(e parquet.Encoding, nullCount int, buf []byte, validBits []byte, validBitsOffset int64) { 328 dec := encoding.NewDecoder(testutils.TypeToParquetType(b.typ), e, b.descr, b.mem) 329 dec.SetData(b.nvalues-nullCount, buf) 330 decoded, _ := decodeSpaced(dec, b.decodeBuf, nullCount, validBits, validBitsOffset) 331 b.Equal(b.nvalues, decoded) 332 333 drawval := reflect.ValueOf(b.draws) 334 decodeval := reflect.ValueOf(b.decodeBuf) 335 for j := 0; j < b.nvalues; j++ { 336 if bitutil.BitIsSet(validBits, int(validBitsOffset)+j) { 337 b.Equal(drawval.Index(j).Interface(), decodeval.Index(j).Interface()) 338 } 339 } 340 } 341 342 func (b *BaseEncodingTestSuite) checkRoundTrip(e parquet.Encoding) { 343 buf, _ := b.encodeTestData(e) 344 defer buf.Release() 345 b.decodeTestData(e, buf.Bytes()) 346 } 347 348 func (b *BaseEncodingTestSuite) checkRoundTripSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) { 349 buf, _ := b.encodeTestDataSpaced(e, validBits, validBitsOffset) 350 defer buf.Release() 351 352 nullCount := 0 353 for i := 0; i < b.nvalues; i++ { 354 if bitutil.BitIsNotSet(validBits, int(validBitsOffset)+i) { 355 nullCount++ 356 } 357 } 358 b.decodeTestDataSpaced(e, nullCount, buf.Bytes(), validBits, validBitsOffset) 359 } 360 361 func (b *BaseEncodingTestSuite) TestBasicRoundTrip() { 362 b.initData(10000, 1) 363 b.checkRoundTrip(parquet.Encodings.Plain) 364 } 365 366 func (b *BaseEncodingTestSuite) TestDeltaEncodingRoundTrip() { 367 b.initData(10000, 1) 368 369 switch b.typ { 370 case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): 371 b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) 372 default: 373 b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaBinaryPacked) }) 374 } 375 } 376 377 func (b *BaseEncodingTestSuite) TestDeltaLengthByteArrayRoundTrip() { 378 b.initData(10000, 1) 379 380 switch b.typ { 381 case reflect.TypeOf(parquet.ByteArray{}): 382 b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) 383 default: 384 b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) 385 } 386 } 387 388 func (b *BaseEncodingTestSuite) TestDeltaByteArrayRoundTrip() { 389 b.initData(10000, 1) 390 391 switch b.typ { 392 case reflect.TypeOf(parquet.ByteArray{}): 393 b.checkRoundTrip(parquet.Encodings.DeltaByteArray) 394 default: 395 b.Panics(func() { b.checkRoundTrip(parquet.Encodings.DeltaLengthByteArray) }) 396 } 397 } 398 399 func (b *BaseEncodingTestSuite) TestSpacedRoundTrip() { 400 exec := func(vals, repeats int, validBitsOffset int64, nullProb float64) { 401 b.Run(fmt.Sprintf("%d vals %d repeats %d offset %0.3f null", vals, repeats, validBitsOffset, 1-nullProb), func() { 402 b.initData(vals, repeats) 403 404 size := int64(b.nvalues) + validBitsOffset 405 r := testutils.NewRandomArrayGenerator(1923) 406 arr := r.Uint8(size, 0, 100, 1-nullProb) 407 validBits := arr.NullBitmapBytes() 408 if validBits != nil { 409 b.checkRoundTripSpaced(parquet.Encodings.Plain, validBits, validBitsOffset) 410 switch b.typ { 411 case reflect.TypeOf(int32(0)), reflect.TypeOf(int64(0)): 412 b.checkRoundTripSpaced(parquet.Encodings.DeltaBinaryPacked, validBits, validBitsOffset) 413 case reflect.TypeOf(parquet.ByteArray{}): 414 b.checkRoundTripSpaced(parquet.Encodings.DeltaLengthByteArray, validBits, validBitsOffset) 415 b.checkRoundTripSpaced(parquet.Encodings.DeltaByteArray, validBits, validBitsOffset) 416 } 417 } 418 }) 419 } 420 421 const ( 422 avx512Size = 64 423 simdSize = avx512Size 424 multiSimdSize = simdSize * 33 425 ) 426 427 for _, nullProb := range []float64{0.001, 0.1, 0.5, 0.9, 0.999} { 428 // Test with both size and offset up to 3 simd block 429 for i := 1; i < simdSize*3; i++ { 430 exec(i, 1, 0, nullProb) 431 exec(i, 1, int64(i+1), nullProb) 432 } 433 // large block and offset 434 exec(multiSimdSize, 1, 0, nullProb) 435 exec(multiSimdSize+33, 1, 0, nullProb) 436 exec(multiSimdSize, 1, 33, nullProb) 437 exec(multiSimdSize+33, 1, 33, nullProb) 438 } 439 } 440 441 func TestEncoding(t *testing.T) { 442 tests := []struct { 443 name string 444 typ reflect.Type 445 }{ 446 {"Bool", reflect.TypeOf(true)}, 447 {"Int32", reflect.TypeOf(int32(0))}, 448 {"Int64", reflect.TypeOf(int64(0))}, 449 {"Float32", reflect.TypeOf(float32(0))}, 450 {"Float64", reflect.TypeOf(float64(0))}, 451 {"Int96", reflect.TypeOf(parquet.Int96{})}, 452 {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, 453 {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, 454 } 455 456 for _, tt := range tests { 457 t.Run(tt.name, func(t *testing.T) { 458 suite.Run(t, &BaseEncodingTestSuite{typ: tt.typ}) 459 }) 460 } 461 } 462 463 type DictionaryEncodingTestSuite struct { 464 BaseEncodingTestSuite 465 } 466 467 func (d *DictionaryEncodingTestSuite) encodeTestDataDict(e parquet.Encoding) (dictBuffer, indices encoding.Buffer, numEntries int) { 468 enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) 469 470 d.Equal(parquet.Encodings.PlainDict, enc.Encoding()) 471 d.Equal(d.descr.PhysicalType(), enc.Type()) 472 encode(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface()) 473 dictBuffer = memory.NewResizableBuffer(d.mem) 474 dictBuffer.Resize(enc.DictEncodedSize()) 475 enc.WriteDict(dictBuffer.Bytes()) 476 indices, _ = enc.FlushValues() 477 numEntries = enc.NumEntries() 478 return 479 } 480 481 func (d *DictionaryEncodingTestSuite) encodeTestDataDictSpaced(e parquet.Encoding, validBits []byte, validBitsOffset int64) (dictBuffer, indices encoding.Buffer, numEntries int) { 482 enc := encoding.NewEncoder(testutils.TypeToParquetType(d.typ), e, true, d.descr, memory.DefaultAllocator).(encoding.DictEncoder) 483 d.Equal(d.descr.PhysicalType(), enc.Type()) 484 485 encodeSpaced(enc, reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), validBits, validBitsOffset) 486 dictBuffer = memory.NewResizableBuffer(d.mem) 487 dictBuffer.Resize(enc.DictEncodedSize()) 488 enc.WriteDict(dictBuffer.Bytes()) 489 indices, _ = enc.FlushValues() 490 numEntries = enc.NumEntries() 491 return 492 } 493 494 func (d *DictionaryEncodingTestSuite) checkRoundTrip() { 495 dictBuffer, indices, numEntries := d.encodeTestDataDict(parquet.Encodings.Plain) 496 defer dictBuffer.Release() 497 defer indices.Release() 498 validBits := make([]byte, int(bitutil.BytesForBits(int64(d.nvalues)))+1) 499 memory.Set(validBits, 255) 500 501 spacedBuffer, indicesSpaced, _ := d.encodeTestDataDictSpaced(parquet.Encodings.Plain, validBits, 0) 502 defer spacedBuffer.Release() 503 defer indicesSpaced.Release() 504 d.Equal(indices.Bytes(), indicesSpaced.Bytes()) 505 506 dictDecoder := encoding.NewDecoder(testutils.TypeToParquetType(d.typ), parquet.Encodings.Plain, d.descr, d.mem) 507 d.Equal(d.descr.PhysicalType(), dictDecoder.Type()) 508 dictDecoder.SetData(numEntries, dictBuffer.Bytes()) 509 decoder := encoding.NewDictDecoder(testutils.TypeToParquetType(d.typ), d.descr, d.mem) 510 decoder.SetDict(dictDecoder) 511 decoder.SetData(d.nvalues, indices.Bytes()) 512 513 decoded, _ := decode(decoder, d.decodeBuf) 514 d.Equal(d.nvalues, decoded) 515 d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) 516 517 decoder.SetData(d.nvalues, indices.Bytes()) 518 decoded, _ = decodeSpaced(decoder, d.decodeBuf, 0, validBits, 0) 519 d.Equal(d.nvalues, decoded) 520 d.Equal(reflect.ValueOf(d.draws).Slice(0, d.nvalues).Interface(), reflect.ValueOf(d.decodeBuf).Slice(0, d.nvalues).Interface()) 521 } 522 523 func (d *DictionaryEncodingTestSuite) TestBasicRoundTrip() { 524 d.initData(2500, 2) 525 d.checkRoundTrip() 526 } 527 528 func TestDictEncoding(t *testing.T) { 529 tests := []struct { 530 name string 531 typ reflect.Type 532 }{ 533 {"Int32", reflect.TypeOf(int32(0))}, 534 {"Int64", reflect.TypeOf(int64(0))}, 535 {"Float32", reflect.TypeOf(float32(0))}, 536 {"Float64", reflect.TypeOf(float64(0))}, 537 {"ByteArray", reflect.TypeOf(parquet.ByteArray{})}, 538 {"FixedLenByteArray", reflect.TypeOf(parquet.FixedLenByteArray{})}, 539 } 540 541 for _, tt := range tests { 542 t.Run(tt.name, func(t *testing.T) { 543 suite.Run(t, &DictionaryEncodingTestSuite{BaseEncodingTestSuite{typ: tt.typ}}) 544 }) 545 } 546 } 547 548 func TestWriteDeltaBitPackedInt32(t *testing.T) { 549 column := schema.NewColumn(schema.NewInt32Node("int32", parquet.Repetitions.Required, -1), 0, 0) 550 551 tests := []struct { 552 name string 553 toencode []int32 554 expected []byte 555 }{ 556 {"simple 12345", []int32{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, 557 {"odd vals", []int32{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, 558 } 559 560 for _, tt := range tests { 561 t.Run(tt.name, func(t *testing.T) { 562 enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 563 564 enc.(encoding.Int32Encoder).Put(tt.toencode) 565 buf, _ := enc.FlushValues() 566 defer buf.Release() 567 568 assert.Equal(t, tt.expected, buf.Bytes()) 569 570 dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 571 572 dec.(encoding.Int32Decoder).SetData(len(tt.toencode), tt.expected) 573 out := make([]int32, len(tt.toencode)) 574 dec.(encoding.Int32Decoder).Decode(out) 575 assert.Equal(t, tt.toencode, out) 576 }) 577 } 578 579 t.Run("test progressive decoding", func(t *testing.T) { 580 values := make([]int32, 1000) 581 testutils.FillRandomInt32(0, values) 582 583 enc := encoding.NewEncoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 584 enc.(encoding.Int32Encoder).Put(values) 585 buf, _ := enc.FlushValues() 586 defer buf.Release() 587 588 dec := encoding.NewDecoder(parquet.Types.Int32, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 589 dec.(encoding.Int32Decoder).SetData(len(values), buf.Bytes()) 590 591 valueBuf := make([]int32, 100) 592 for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { 593 dec.(encoding.Int32Decoder).Decode(valueBuf) 594 assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) 595 } 596 }) 597 } 598 599 func TestWriteDeltaBitPackedInt64(t *testing.T) { 600 column := schema.NewColumn(schema.NewInt64Node("int64", parquet.Repetitions.Required, -1), 0, 0) 601 602 tests := []struct { 603 name string 604 toencode []int64 605 expected []byte 606 }{ 607 {"simple 12345", []int64{1, 2, 3, 4, 5}, []byte{128, 1, 4, 5, 2, 2, 0, 0, 0, 0}}, 608 {"odd vals", []int64{7, 5, 3, 1, 2, 3, 4, 5}, []byte{128, 1, 4, 8, 14, 3, 2, 0, 0, 0, 192, 63, 0, 0, 0, 0, 0, 0}}, 609 } 610 611 for _, tt := range tests { 612 t.Run(tt.name, func(t *testing.T) { 613 enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 614 615 enc.(encoding.Int64Encoder).Put(tt.toencode) 616 buf, _ := enc.FlushValues() 617 defer buf.Release() 618 619 assert.Equal(t, tt.expected, buf.Bytes()) 620 621 dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 622 623 dec.(encoding.Int64Decoder).SetData(len(tt.toencode), tt.expected) 624 out := make([]int64, len(tt.toencode)) 625 dec.(encoding.Int64Decoder).Decode(out) 626 assert.Equal(t, tt.toencode, out) 627 }) 628 } 629 630 t.Run("test progressive decoding", func(t *testing.T) { 631 values := make([]int64, 1000) 632 testutils.FillRandomInt64(0, values) 633 634 enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 635 enc.(encoding.Int64Encoder).Put(values) 636 buf, _ := enc.FlushValues() 637 defer buf.Release() 638 639 dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 640 dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes()) 641 642 valueBuf := make([]int64, 100) 643 for i, j := 0, len(valueBuf); j <= len(values); i, j = i+len(valueBuf), j+len(valueBuf) { 644 decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf) 645 assert.Equal(t, len(valueBuf), decoded) 646 assert.Equalf(t, values[i:j], valueBuf, "indexes %d:%d", i, j) 647 } 648 }) 649 650 t.Run("GH-37102", func(t *testing.T) { 651 values := []int64{ 652 0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 653 0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 654 0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 655 0, 3000000000000000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 656 0, 0, 657 } 658 659 enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, column, memory.DefaultAllocator) 660 enc.(encoding.Int64Encoder).Put(values) 661 buf, _ := enc.FlushValues() 662 defer buf.Release() 663 664 dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, column, memory.DefaultAllocator) 665 dec.(encoding.Int64Decoder).SetData(len(values), buf.Bytes()) 666 667 valueBuf := make([]int64, len(values)) 668 669 decoded, _ := dec.(encoding.Int64Decoder).Decode(valueBuf) 670 assert.Equal(t, len(valueBuf), decoded) 671 assert.Equal(t, values, valueBuf) 672 }) 673 } 674 675 func TestDeltaLengthByteArrayEncoding(t *testing.T) { 676 column := schema.NewColumn(schema.NewByteArrayNode("bytearray", parquet.Repetitions.Required, -1), 0, 0) 677 678 test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")} 679 expected := []byte{128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} 680 681 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, false, column, memory.DefaultAllocator) 682 enc.(encoding.ByteArrayEncoder).Put(test) 683 buf, _ := enc.FlushValues() 684 defer buf.Release() 685 686 assert.Equal(t, expected, buf.Bytes()) 687 688 dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaLengthByteArray, column, nil) 689 dec.SetData(len(test), expected) 690 out := make([]parquet.ByteArray, len(test)) 691 decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) 692 assert.Equal(t, len(test), decoded) 693 assert.Equal(t, test, out) 694 } 695 696 func TestDeltaByteArrayEncoding(t *testing.T) { 697 test := []parquet.ByteArray{[]byte("Hello"), []byte("World"), []byte("Foobar"), []byte("ABCDEF")} 698 expected := []byte{128, 1, 4, 4, 0, 0, 0, 0, 0, 0, 128, 1, 4, 4, 10, 0, 1, 0, 0, 0, 2, 0, 0, 0, 72, 101, 108, 108, 111, 87, 111, 114, 108, 100, 70, 111, 111, 98, 97, 114, 65, 66, 67, 68, 69, 70} 699 700 enc := encoding.NewEncoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, false, nil, nil) 701 enc.(encoding.ByteArrayEncoder).Put(test) 702 buf, _ := enc.FlushValues() 703 defer buf.Release() 704 705 assert.Equal(t, expected, buf.Bytes()) 706 707 dec := encoding.NewDecoder(parquet.Types.ByteArray, parquet.Encodings.DeltaByteArray, nil, nil) 708 dec.SetData(len(test), expected) 709 out := make([]parquet.ByteArray, len(test)) 710 decoded, _ := dec.(encoding.ByteArrayDecoder).Decode(out) 711 assert.Equal(t, len(test), decoded) 712 assert.Equal(t, test, out) 713 } 714 715 func TestDeltaBitPacking(t *testing.T) { 716 datadir := os.Getenv("ARROW_TEST_DATA") 717 if datadir == "" { 718 return 719 } 720 721 fname := path.Join(datadir, "parquet/timestamp.data") 722 require.FileExists(t, fname) 723 f, err := os.Open(fname) 724 if err != nil { 725 t.Fatal(err) 726 } 727 defer f.Close() 728 729 values := make([]int64, 0) 730 731 scanner := bufio.NewScanner(f) 732 for scanner.Scan() { 733 v, err := strconv.ParseInt(scanner.Text(), 10, 64) 734 if err != nil { 735 t.Fatal(err) 736 } 737 values = append(values, v) 738 } 739 740 if err := scanner.Err(); err != nil { 741 t.Fatal(err) 742 } 743 744 col := schema.NewColumn(schema.MustPrimitive(schema.NewPrimitiveNode("foo", parquet.Repetitions.Required, 745 parquet.Types.Int64, -1, -1)), 0, 0) 746 enc := encoding.NewEncoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, false, col, memory.DefaultAllocator).(encoding.Int64Encoder) 747 748 enc.Put(values) 749 buf, err := enc.FlushValues() 750 if err != nil { 751 t.Fatal(err) 752 } 753 defer buf.Release() 754 755 dec := encoding.NewDecoder(parquet.Types.Int64, parquet.Encodings.DeltaBinaryPacked, col, memory.DefaultAllocator).(encoding.Int64Decoder) 756 dec.SetData(len(values), buf.Bytes()) 757 758 ll := len(values) 759 for i := 0; i < ll; i += 1024 { 760 out := make([]int64, 1024) 761 n, err := dec.Decode(out) 762 if err != nil { 763 t.Fatal(err) 764 } 765 assert.Equal(t, values[:n], out[:n]) 766 values = values[n:] 767 } 768 assert.Equal(t, dec.ValuesLeft(), 0) 769 } 770 771 func TestBooleanPlainDecoderAfterFlushing(t *testing.T) { 772 descr := schema.NewColumn(schema.NewBooleanNode("bool", parquet.Repetitions.Optional, -1), 0, 0) 773 enc := encoding.NewEncoder(parquet.Types.Boolean, parquet.Encodings.Plain, false, descr, memory.DefaultAllocator) 774 benc := enc.(encoding.BooleanEncoder) 775 776 dec := encoding.NewDecoder(parquet.Types.Boolean, parquet.Encodings.Plain, descr, memory.DefaultAllocator) 777 decSlice := make([]bool, 1) 778 bdec := dec.(encoding.BooleanDecoder) 779 780 // Write and extract two different values 781 // This is validating that `FlushValues` wholly 782 // resets the encoder state. 783 benc.Put([]bool{true}) 784 buf1, err := benc.FlushValues() 785 assert.NoError(t, err) 786 787 benc.Put([]bool{false}) 788 buf2, err := benc.FlushValues() 789 assert.NoError(t, err) 790 791 // Decode buf1, expect true 792 err = bdec.SetData(1, buf1.Buf()) 793 assert.NoError(t, err) 794 n, err := bdec.Decode(decSlice) 795 assert.NoError(t, err) 796 assert.Equal(t, n, 1) 797 assert.Equal(t, decSlice[0], true) 798 799 // Decode buf2, expect false 800 err = bdec.SetData(1, buf2.Buf()) 801 assert.NoError(t, err) 802 n, err = bdec.Decode(decSlice) 803 assert.NoError(t, err) 804 assert.Equal(t, n, 1) 805 assert.Equal(t, decSlice[0], false) 806 }