github.com/apache/arrow/go/v16@v16.1.0/parquet/metadata/statistics_test.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metadata_test

import (
	"math"
	"reflect"
	"testing"

	"github.com/apache/arrow/go/v16/arrow/bitutil"
	"github.com/apache/arrow/go/v16/arrow/float16"
	"github.com/apache/arrow/go/v16/arrow/memory"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/metadata"
	"github.com/apache/arrow/go/v16/parquet/schema"
	"github.com/stretchr/testify/assert"
)

// NOTE(zeroshade): tests will be added and updated after merging the "file" package
// since the tests that I wrote relied on the file writer/reader for ease of use.

func newFloat16Node(name string, rep parquet.Repetition, fieldID int32) *schema.PrimitiveNode {
	return schema.MustPrimitive(schema.NewPrimitiveNodeLogical(name, rep, schema.Float16LogicalType{}, parquet.Types.FixedLenByteArray, 2, fieldID))
}

func TestCheckNaNs(t *testing.T) {
	const (
		numvals = 8
		min     = -4.0
		max     = 3.0
	)
	var (
		nan                               = math.NaN()
		f16Min parquet.FixedLenByteArray = float16.New(float32(min)).ToLEBytes()
		f16Max parquet.FixedLenByteArray = float16.New(float32(max)).ToLEBytes()
	)

	allNans := []float64{nan, nan, nan, nan, nan, nan, nan, nan}
	allNansf32 := make([]float32, numvals)
	allNansf16 := make([]parquet.FixedLenByteArray, numvals)
	for idx, v := range allNans {
		allNansf32[idx] = float32(v)
		allNansf16[idx] = float16.New(float32(v)).ToLEBytes()
	}

	someNans := []float64{nan, max, -3.0, -1.0, nan, 2.0, min, nan}
	someNansf32 := make([]float32, numvals)
	someNansf16 := make([]parquet.FixedLenByteArray, numvals)
	for idx, v := range someNans {
		someNansf32[idx] = float32(v)
		someNansf16[idx] = float16.New(float32(v)).ToLEBytes()
	}

	validBitmap := []byte{0x7F}       // 0b01111111
	validBitmapNoNaNs := []byte{0x6E} // 0b01101110

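	// assertUnsetMinMax ingests values into stats (Update when bitmap is nil,
	// UpdateSpaced otherwise) and asserts that no min/max pair was recorded.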
	assertUnsetMinMax := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte) {
		if bitmap == nil {
			switch s := stats.(type) {
			case *metadata.Float32Statistics:
				s.Update(values.([]float32), 0)
			case *metadata.Float64Statistics:
				s.Update(values.([]float64), 0)
			case *metadata.Float16Statistics:
				s.Update(values.([]parquet.FixedLenByteArray), 0)
			}
			assert.False(t, stats.HasMinMax())
		} else {
			nvalues := reflect.ValueOf(values).Len()
			nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
			switch s := stats.(type) {
			case *metadata.Float32Statistics:
				s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
			case *metadata.Float64Statistics:
				s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
			case *metadata.Float16Statistics:
				s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
			}
			assert.False(t, stats.HasMinMax())
		}
	}

	assertMinMaxAre := func(stats metadata.TypedStatistics, values interface{}, expectedMin, expectedMax interface{}) {
		switch s := stats.(type) {
		case *metadata.Float32Statistics:
			s.Update(values.([]float32), 0)
			assert.True(t, stats.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float64Statistics:
			s.Update(values.([]float64), 0)
			assert.True(t, stats.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float16Statistics:
			s.Update(values.([]parquet.FixedLenByteArray), 0)
			assert.True(t, stats.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		}
	}

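	// assertMinMaxAreSpaced is the UpdateSpaced counterpart of assertMinMaxAre:
	// values are ingested through the supplied validity bitmap and the recorded
	// min/max must equal the expected pair.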
	assertMinMaxAreSpaced := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte, expectedMin, expectedMax interface{}) {
		nvalues := reflect.ValueOf(values).Len()
		nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
		switch s := stats.(type) {
		case *metadata.Float32Statistics:
			s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
			assert.True(t, s.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float64Statistics:
			s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
			assert.True(t, s.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		case *metadata.Float16Statistics:
			s.UpdateSpaced(values.([]parquet.FixedLenByteArray), bitmap, 0, int64(nullCount))
			assert.True(t, s.HasMinMax())
			assert.Equal(t, expectedMin, s.Min())
			assert.Equal(t, expectedMax, s.Max())
		}
	}

	f32Col := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
	f64Col := schema.NewColumn(schema.NewFloat64Node("f", parquet.Repetitions.Optional, -1), 1, 1)
	f16Col := schema.NewColumn(newFloat16Node("f", parquet.Repetitions.Required, -1), 1, 1)
	// test values
	someNanStats := metadata.NewStatistics(f64Col, memory.DefaultAllocator)
	someNanStatsf32 := metadata.NewStatistics(f32Col, memory.DefaultAllocator)
	someNanStatsf16 := metadata.NewStatistics(f16Col, memory.DefaultAllocator)
	// ingesting only nans should not yield a min or max
	assertUnsetMinMax(someNanStats, allNans, nil)
	assertUnsetMinMax(someNanStatsf32, allNansf32, nil)
	assertUnsetMinMax(someNanStatsf16, allNansf16, nil)
	// ingesting a mix should yield a valid min/max
	assertMinMaxAre(someNanStats, someNans, min, max)
	assertMinMaxAre(someNanStatsf32, someNansf32, float32(min), float32(max))
	assertMinMaxAre(someNanStatsf16, someNansf16, f16Min, f16Max)
	// ingesting only nans after a valid min/max should have no effect
	assertMinMaxAre(someNanStats, allNans, min, max)
	assertMinMaxAre(someNanStatsf32, allNansf32, float32(min), float32(max))
	assertMinMaxAre(someNanStatsf16, allNansf16, f16Min, f16Max)

	someNanStats = metadata.NewStatistics(f64Col, memory.DefaultAllocator)
	someNanStatsf32 = metadata.NewStatistics(f32Col, memory.DefaultAllocator)
	someNanStatsf16 = metadata.NewStatistics(f16Col, memory.DefaultAllocator)
	assertUnsetMinMax(someNanStats, allNans, validBitmap)
	assertUnsetMinMax(someNanStatsf32, allNansf32, validBitmap)
	assertUnsetMinMax(someNanStatsf16, allNansf16, validBitmap)
	// nans should not pollute min/max when excluded via null bitmap
	assertMinMaxAreSpaced(someNanStats, someNans, validBitmapNoNaNs, min, max)
	assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmapNoNaNs, float32(min), float32(max))
	assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmapNoNaNs, f16Min, f16Max)
	// ingesting nans with a null bitmap should not change the result
	assertMinMaxAreSpaced(someNanStats, someNans, validBitmap, min, max)
	assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmap, float32(min), float32(max))
	assertMinMaxAreSpaced(someNanStatsf16, someNansf16, validBitmap, f16Min, f16Max)
}

func TestCheckNegativeZeroStats(t *testing.T) {
	assertMinMaxZeroesSign := func(stats metadata.TypedStatistics, values interface{}) {
		switch s := stats.(type) {
		case *metadata.Float32Statistics:
			s.Update(values.([]float32), 0)
			assert.True(t, s.HasMinMax())
			var zero float32
			assert.Equal(t, zero, s.Min())
			assert.True(t, math.Signbit(float64(s.Min())))
			assert.Equal(t, zero, s.Max())
			assert.False(t, math.Signbit(float64(s.Max())))
		case *metadata.Float64Statistics:
			s.Update(values.([]float64), 0)
			assert.True(t, s.HasMinMax())
			var zero float64
			assert.Equal(t, zero, s.Min())
			assert.True(t, math.Signbit(s.Min()))
			assert.Equal(t, zero, s.Max())
			assert.False(t, math.Signbit(s.Max()))
		case *metadata.Float16Statistics:
			s.Update(values.([]parquet.FixedLenByteArray), 0)
			assert.True(t, s.HasMinMax())
			var zero float64
			min := float64(float16.FromLEBytes(s.Min()).Float32())
			max := float64(float16.FromLEBytes(s.Max()).Float32())
			assert.Equal(t, zero, min)
			assert.True(t, math.Signbit(min))
			assert.Equal(t, zero, max)
			assert.False(t, math.Signbit(max))
		}
	}

	fcol := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
	dcol := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1)
	hcol := schema.NewColumn(newFloat16Node("h", parquet.Repetitions.Optional, -1), 1, 1)

	var f32zero float32
	var f64zero float64
	var f16PosZero parquet.FixedLenByteArray = float16.New(+f32zero).ToLEBytes()
	var f16NegZero parquet.FixedLenByteArray = float16.New(-f32zero).ToLEBytes()

	assert.False(t, float16.FromLEBytes(f16PosZero).Signbit())
	assert.True(t, float16.FromLEBytes(f16NegZero).Signbit())
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{-f32zero, f32zero})
		assertMinMaxZeroesSign(dstats, []float64{-f64zero, f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16PosZero})
	}
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{f32zero, -f32zero})
		assertMinMaxZeroesSign(dstats, []float64{f64zero, -f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16NegZero})
	}
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{-f32zero, -f32zero})
		assertMinMaxZeroesSign(dstats, []float64{-f64zero, -f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16NegZero, f16NegZero})
	}
	{
		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
		hstats := metadata.NewStatistics(hcol, memory.DefaultAllocator)
		assertMinMaxZeroesSign(fstats, []float32{f32zero, f32zero})
		assertMinMaxZeroesSign(dstats, []float64{f64zero, f64zero})
		assertMinMaxZeroesSign(hstats, []parquet.FixedLenByteArray{f16PosZero, f16PosZero})
	}
}

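// TestBooleanStatisticsEncoding checks that boolean min/max values are encoded
// as single bytes: 0 for false and 1 for true.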
func TestBooleanStatisticsEncoding(t *testing.T) {
	n := schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1)
	descr := schema.NewColumn(n, 0, 0)
	s := metadata.NewStatistics(descr, nil)
	bs := s.(*metadata.BooleanStatistics)
	bs.SetMinMax(false, true)
	maxEnc := bs.EncodeMax()
	minEnc := bs.EncodeMin()
	assert.Equal(t, []byte{1}, maxEnc)
	assert.Equal(t, []byte{0}, minEnc)
}
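
// A minimal usage sketch, kept to the APIs already exercised above (the test
// name and values here are illustrative only): build a column descriptor,
// create typed statistics for it, ingest a batch with Update, and read the
// accumulated min/max back out.
func TestFloat64StatisticsUsageSketch(t *testing.T) {
	col := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1)
	stats := metadata.NewStatistics(col, memory.DefaultAllocator).(*metadata.Float64Statistics)

	// The second argument to Update is the number of null values in the batch.
	stats.Update([]float64{2.5, -1.5, 0.25}, 0)

	assert.True(t, stats.HasMinMax())
	assert.Equal(t, -1.5, stats.Min())
	assert.Equal(t, 2.5, stats.Max())
}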