github.com/apache/arrow/go/v14@v14.0.1/parquet/metadata/stat_compare_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata
    18  
    19  import (
    20  	"encoding/binary"
    21  	"testing"
    22  
    23  	"github.com/apache/arrow/go/v14/parquet"
    24  	"github.com/apache/arrow/go/v14/parquet/schema"
    25  	"github.com/stretchr/testify/assert"
    26  	"github.com/stretchr/testify/require"
    27  )
    28  
    29  func TestSignedByteArrayCompare(t *testing.T) {
    30  	s := ByteArrayStatistics{
    31  		statistics: statistics{
    32  			order: schema.SortSIGNED,
    33  		},
    34  	}
    35  
    36  	// signed byte array comparison is only used for Decimal comparison.
    37  	// when decimals are encoded as byte arrays they use twos compliment
    38  	// big-endian encoded values. Comparisons of byte arrays of unequal
    39  	// types need to handle sign extension.
    40  
    41  	tests := []struct {
    42  		b     []byte
    43  		order int
    44  	}{
    45  		{[]byte{0x80, 0x80, 0, 0}, 0},
    46  		{[]byte{ /*0xFF,*/ 0x80, 0, 0}, 1},
    47  		{[]byte{0xFF, 0x80, 0, 0}, 1},
    48  		{[]byte{ /*0xFF,*/ 0xFF, 0x01, 0}, 2},
    49  		{[]byte{ /*0xFF, 0xFF,*/ 0x80, 0}, 3},
    50  		{[]byte{ /*0xFF,*/ 0xFF, 0x80, 0}, 3},
    51  		{[]byte{0xFF, 0xFF, 0x80, 0}, 3},
    52  		{[]byte{ /*0xFF,0xFF,0xFF,*/ 0x80}, 4},
    53  		{[]byte{ /*0xFF,0xFF,0xFF*/ 0xFF}, 5},
    54  		{[]byte{ /*0, 0,*/ 0x01, 0x01}, 6},
    55  		{[]byte{ /*0,*/ 0, 0x01, 0x01}, 6},
    56  		{[]byte{0, 0, 0x01, 0x01}, 6},
    57  		{[]byte{ /*0,*/ 0x01, 0x01, 0}, 7},
    58  		{[]byte{0x01, 0x01, 0, 0}, 8},
    59  	}
    60  
    61  	for i, tt := range tests {
    62  		// empty array is always the smallest
    63  		assert.Truef(t, s.less(parquet.ByteArray{}, parquet.ByteArray(tt.b)), "case: %d", i)
    64  		assert.Falsef(t, s.less(parquet.ByteArray(tt.b), parquet.ByteArray{}), "case: %d", i)
    65  		// equals is always false
    66  		assert.Falsef(t, s.less(parquet.ByteArray(tt.b), parquet.ByteArray(tt.b)), "case: %d", i)
    67  
    68  		for j, case2 := range tests {
    69  			var fn func(assert.TestingT, bool, string, ...interface{}) bool
    70  			if tt.order < case2.order {
    71  				fn = assert.Truef
    72  			} else {
    73  				fn = assert.Falsef
    74  			}
    75  			fn(t, s.less(parquet.ByteArray(tt.b), parquet.ByteArray(case2.b)),
    76  				"%d (order: %d) %d (order: %d)", i, tt.order, j, case2.order)
    77  		}
    78  	}
    79  }
    80  
    81  func TestUnsignedByteArrayCompare(t *testing.T) {
    82  	s := ByteArrayStatistics{
    83  		statistics: statistics{
    84  			order: schema.SortUNSIGNED,
    85  		},
    86  	}
    87  
    88  	s1ba := parquet.ByteArray("arrange")
    89  	s2ba := parquet.ByteArray("arrangement")
    90  	assert.True(t, s.less(s1ba, s2ba))
    91  
    92  	// multi-byte utf-8 characters
    93  	s1ba = parquet.ByteArray("braten")
    94  	s2ba = parquet.ByteArray("bügeln")
    95  	assert.True(t, s.less(s1ba, s2ba))
    96  
    97  	s1ba = parquet.ByteArray("ünk123456") // ü = 252
    98  	s2ba = parquet.ByteArray("ănk123456") // ă = 259
    99  	assert.True(t, s.less(s1ba, s2ba))
   100  }
   101  
   102  func TestSignedCompareFLBA(t *testing.T) {
   103  	s := FixedLenByteArrayStatistics{
   104  		statistics: statistics{order: schema.SortSIGNED},
   105  	}
   106  
   107  	values := []parquet.FixedLenByteArray{
   108  		[]byte{0x80, 0, 0, 0},
   109  		[]byte{0xFF, 0xFF, 0x01, 0},
   110  		[]byte{0xFF, 0xFF, 0x80, 0},
   111  		[]byte{0xFF, 0xFF, 0xFF, 0x80},
   112  		[]byte{0xFF, 0xFF, 0xFF, 0xFF},
   113  		[]byte{0, 0, 0x01, 0x01},
   114  		[]byte{0, 0x01, 0x01, 0},
   115  		[]byte{0x01, 0x01, 0, 0},
   116  	}
   117  
   118  	for i, v := range values {
   119  		assert.Falsef(t, s.less(v, v), "%d", i)
   120  		for j, v2 := range values[i+1:] {
   121  			assert.Truef(t, s.less(v, v2), "%d %d", i, j)
   122  			assert.Falsef(t, s.less(v2, v), "%d %d", j, i)
   123  		}
   124  	}
   125  }
   126  
   127  func TestUnsignedCompareFLBA(t *testing.T) {
   128  	s := FixedLenByteArrayStatistics{
   129  		statistics: statistics{order: schema.SortUNSIGNED},
   130  	}
   131  
   132  	s1flba := parquet.FixedLenByteArray("Anti123456")
   133  	s2flba := parquet.FixedLenByteArray("Bunkd123456")
   134  	assert.True(t, s.less(s1flba, s2flba))
   135  
   136  	s1flba = parquet.FixedLenByteArray("Bunk123456")
   137  	s2flba = parquet.FixedLenByteArray("Bünk123456")
   138  	assert.True(t, s.less(s1flba, s2flba))
   139  }
   140  
   141  func TestSignedCompareInt96(t *testing.T) {
   142  	s := Int96Statistics{
   143  		statistics: statistics{order: schema.SortSIGNED},
   144  	}
   145  
   146  	val := -14
   147  
   148  	var (
   149  		a   = parquet.NewInt96([3]uint32{1, 41, 14})
   150  		b   = parquet.NewInt96([3]uint32{1, 41, 42})
   151  		aa  = parquet.NewInt96([3]uint32{1, 41, 14})
   152  		bb  = parquet.NewInt96([3]uint32{1, 41, 14})
   153  		aaa = parquet.NewInt96([3]uint32{1, 41, uint32(val)})
   154  		bbb = parquet.NewInt96([3]uint32{1, 41, 42})
   155  	)
   156  
   157  	assert.True(t, s.less(a, b))
   158  	assert.True(t, !s.less(aa, bb) && !s.less(bb, aa))
   159  	assert.True(t, s.less(aaa, bbb))
   160  }
   161  
   162  func TestUnsignedCompareInt96(t *testing.T) {
   163  	s := Int96Statistics{
   164  		statistics: statistics{order: schema.SortUNSIGNED},
   165  	}
   166  
   167  	valb := -41
   168  	valbb := -14
   169  
   170  	var (
   171  		a   = parquet.NewInt96([3]uint32{1, 41, 14})
   172  		b   = parquet.NewInt96([3]uint32{1, uint32(valb), 42})
   173  		aa  = parquet.NewInt96([3]uint32{1, 41, 14})
   174  		bb  = parquet.NewInt96([3]uint32{1, 41, uint32(valbb)})
   175  		aaa parquet.Int96
   176  		bbb parquet.Int96
   177  	)
   178  
   179  	assert.True(t, s.less(a, b))
   180  	assert.True(t, s.less(aa, bb))
   181  
   182  	binary.LittleEndian.PutUint32(aaa[8:], 2451545) // 2000-01-01
   183  	binary.LittleEndian.PutUint32(bbb[8:], 2451546) // 2000-01-02
   184  	// 12 hours + 34 minutes + 56 seconds
   185  	aaa.SetNanoSeconds(45296000000000)
   186  	// 12 hours + 34 minutes + 50 seconds
   187  	bbb.SetNanoSeconds(45290000000000)
   188  	assert.True(t, s.less(aaa, bbb))
   189  
   190  	binary.LittleEndian.PutUint32(aaa[8:], 2451545) // 2000-01-01
   191  	binary.LittleEndian.PutUint32(bbb[8:], 2451545) // 2000-01-01
   192  	// 11 hours + 34 minutes + 56 seconds
   193  	aaa.SetNanoSeconds(41696000000000)
   194  	// 12 hours + 34 minutes + 50 seconds
   195  	bbb.SetNanoSeconds(45290000000000)
   196  	assert.True(t, s.less(aaa, bbb))
   197  
   198  	binary.LittleEndian.PutUint32(aaa[8:], 2451545) // 2000-01-01
   199  	binary.LittleEndian.PutUint32(bbb[8:], 2451545) // 2000-01-01
   200  	// 12 hours + 34 minutes + 55 seconds
   201  	aaa.SetNanoSeconds(45295000000000)
   202  	// 12 hours + 34 minutes + 56 seconds
   203  	bbb.SetNanoSeconds(45296000000000)
   204  	assert.True(t, s.less(aaa, bbb))
   205  }
   206  
   207  func TestCompareSignedInt64(t *testing.T) {
   208  	var (
   209  		a   int64 = 1
   210  		b   int64 = 4
   211  		aa  int64 = 1
   212  		bb  int64 = 1
   213  		aaa int64 = -1
   214  		bbb int64 = 1
   215  	)
   216  
   217  	n := schema.NewInt64Node("signedint64", parquet.Repetitions.Required, -1)
   218  	descr := schema.NewColumn(n, 0, 0)
   219  	s := NewStatistics(descr, nil).(*Int64Statistics)
   220  
   221  	assert.True(t, s.less(a, b))
   222  	assert.True(t, !s.less(aa, bb) && !s.less(bb, aa))
   223  	assert.True(t, s.less(aaa, bbb))
   224  }
   225  
   226  func TestCompareUnsignedInt64(t *testing.T) {
   227  	var (
   228  		a   int64 = 1
   229  		b   int64 = 4
   230  		aa  int64 = 1
   231  		bb  int64 = 1
   232  		aaa int64 = 1
   233  		bbb int64 = -1
   234  	)
   235  
   236  	n, err := schema.NewPrimitiveNodeConverted("unsigned int64", parquet.Repetitions.Required, parquet.Types.Int64, schema.ConvertedTypes.Uint64, 0, 0, 0, 0)
   237  	require.NoError(t, err)
   238  	descr := schema.NewColumn(n, 0, 0)
   239  
   240  	assert.Equal(t, schema.SortUNSIGNED, descr.SortOrder())
   241  	s := NewStatistics(descr, nil).(*Int64Statistics)
   242  
   243  	assert.True(t, s.less(a, b))
   244  	assert.True(t, !s.less(aa, bb) && !s.less(bb, aa))
   245  	assert.True(t, s.less(aaa, bbb))
   246  }
   247  
   248  func TestCompareUnsignedInt32(t *testing.T) {
   249  	var (
   250  		a   int32 = 1
   251  		b   int32 = 4
   252  		aa  int32 = 1
   253  		bb  int32 = 1
   254  		aaa int32 = 1
   255  		bbb int32 = -1
   256  	)
   257  
   258  	n, err := schema.NewPrimitiveNodeConverted("unsigned int32", parquet.Repetitions.Required, parquet.Types.Int32, schema.ConvertedTypes.Uint32, 0, 0, 0, 0)
   259  	require.NoError(t, err)
   260  	descr := schema.NewColumn(n, 0, 0)
   261  
   262  	assert.Equal(t, schema.SortUNSIGNED, descr.SortOrder())
   263  	s := NewStatistics(descr, nil).(*Int32Statistics)
   264  
   265  	assert.True(t, s.less(a, b))
   266  	assert.True(t, !s.less(aa, bb) && !s.less(bb, aa))
   267  	assert.True(t, s.less(aaa, bbb))
   268  }