github.com/apache/arrow/go/v14@v14.0.2/parquet/metadata/statistics_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package metadata_test 18 19 import ( 20 "math" 21 "reflect" 22 "testing" 23 24 "github.com/apache/arrow/go/v14/arrow/bitutil" 25 "github.com/apache/arrow/go/v14/arrow/memory" 26 "github.com/apache/arrow/go/v14/parquet" 27 "github.com/apache/arrow/go/v14/parquet/metadata" 28 "github.com/apache/arrow/go/v14/parquet/schema" 29 "github.com/stretchr/testify/assert" 30 ) 31 32 // NOTE(zeroshade): tests will be added and updated after merging the "file" package 33 // since the tests that I wrote relied on the file writer/reader for ease of use. 34 35 func TestCheckNaNs(t *testing.T) { 36 const ( 37 numvals = 8 38 min = -4.0 39 max = 3.0 40 ) 41 nan := math.NaN() 42 43 allNans := []float64{nan, nan, nan, nan, nan, nan, nan, nan} 44 allNansf32 := make([]float32, numvals) 45 for idx, v := range allNans { 46 allNansf32[idx] = float32(v) 47 } 48 49 someNans := []float64{nan, max, -3.0, -1.0, nan, 2.0, min, nan} 50 someNansf32 := make([]float32, numvals) 51 for idx, v := range someNans { 52 someNansf32[idx] = float32(v) 53 } 54 55 validBitmap := []byte{0x7F} // 0b01111111 56 validBitmapNoNaNs := []byte{0x6E} // 0b01101110 57 58 assertUnsetMinMax := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte) { 59 if bitmap == nil { 60 switch s := stats.(type) { 61 case *metadata.Float32Statistics: 62 s.Update(values.([]float32), 0) 63 case *metadata.Float64Statistics: 64 s.Update(values.([]float64), 0) 65 } 66 assert.False(t, stats.HasMinMax()) 67 } else { 68 nvalues := reflect.ValueOf(values).Len() 69 nullCount := bitutil.CountSetBits(bitmap, 0, nvalues) 70 switch s := stats.(type) { 71 case *metadata.Float32Statistics: 72 s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount)) 73 case *metadata.Float64Statistics: 74 s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount)) 75 } 76 assert.False(t, stats.HasMinMax()) 77 } 78 } 79 80 assertMinMaxAre := func(stats metadata.TypedStatistics, values interface{}, expectedMin, expectedMax interface{}) { 81 switch s := stats.(type) { 82 case *metadata.Float32Statistics: 83 s.Update(values.([]float32), 0) 84 assert.True(t, stats.HasMinMax()) 85 assert.Equal(t, expectedMin, s.Min()) 86 assert.Equal(t, expectedMax, s.Max()) 87 case *metadata.Float64Statistics: 88 s.Update(values.([]float64), 0) 89 assert.True(t, stats.HasMinMax()) 90 assert.Equal(t, expectedMin, s.Min()) 91 assert.Equal(t, expectedMax, s.Max()) 92 } 93 } 94 95 assertMinMaxAreSpaced := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte, expectedMin, expectedMax interface{}) { 96 nvalues := reflect.ValueOf(values).Len() 97 nullCount := bitutil.CountSetBits(bitmap, 0, nvalues) 98 switch s := stats.(type) { 99 case *metadata.Float32Statistics: 100 s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount)) 101 assert.True(t, s.HasMinMax()) 102 assert.Equal(t, expectedMin, s.Min()) 103 assert.Equal(t, expectedMax, s.Max()) 104 case *metadata.Float64Statistics: 105 s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount)) 106 assert.True(t, s.HasMinMax()) 107 assert.Equal(t, expectedMin, s.Min()) 108 assert.Equal(t, expectedMax, s.Max()) 109 } 110 } 111 112 f32Col := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1) 113 f64Col := schema.NewColumn(schema.NewFloat64Node("f", parquet.Repetitions.Optional, -1), 1, 1) 114 // test values 115 someNanStats := metadata.NewStatistics(f64Col, memory.DefaultAllocator) 116 someNanStatsf32 := metadata.NewStatistics(f32Col, memory.DefaultAllocator) 117 // ingesting only nans should not yield a min or max 118 assertUnsetMinMax(someNanStats, allNans, nil) 119 assertUnsetMinMax(someNanStatsf32, allNansf32, nil) 120 // ingesting a mix should yield a valid min/max 121 assertMinMaxAre(someNanStats, someNans, min, max) 122 assertMinMaxAre(someNanStatsf32, someNansf32, float32(min), float32(max)) 123 // ingesting only nans after a valid min/max should have no effect 124 assertMinMaxAre(someNanStats, allNans, min, max) 125 assertMinMaxAre(someNanStatsf32, allNansf32, float32(min), float32(max)) 126 127 someNanStats = metadata.NewStatistics(f64Col, memory.DefaultAllocator) 128 someNanStatsf32 = metadata.NewStatistics(f32Col, memory.DefaultAllocator) 129 assertUnsetMinMax(someNanStats, allNans, validBitmap) 130 assertUnsetMinMax(someNanStatsf32, allNansf32, validBitmap) 131 // nans should not pollute min/max when excluded via null bitmap 132 assertMinMaxAreSpaced(someNanStats, someNans, validBitmapNoNaNs, min, max) 133 assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmapNoNaNs, float32(min), float32(max)) 134 // ingesting nans with a null bitmap should not change the result 135 assertMinMaxAreSpaced(someNanStats, someNans, validBitmap, min, max) 136 assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmap, float32(min), float32(max)) 137 } 138 139 func TestCheckNegativeZeroStats(t *testing.T) { 140 assertMinMaxZeroesSign := func(stats metadata.TypedStatistics, values interface{}) { 141 switch s := stats.(type) { 142 case *metadata.Float32Statistics: 143 s.Update(values.([]float32), 0) 144 assert.True(t, s.HasMinMax()) 145 var zero float32 146 assert.Equal(t, zero, s.Min()) 147 assert.True(t, math.Signbit(float64(s.Min()))) 148 assert.Equal(t, zero, s.Max()) 149 assert.False(t, math.Signbit(float64(s.Max()))) 150 case *metadata.Float64Statistics: 151 s.Update(values.([]float64), 0) 152 assert.True(t, s.HasMinMax()) 153 var zero float64 154 assert.Equal(t, zero, s.Min()) 155 assert.True(t, math.Signbit(s.Min())) 156 assert.Equal(t, zero, s.Max()) 157 assert.False(t, math.Signbit(s.Max())) 158 } 159 } 160 161 fcol := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1) 162 dcol := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1) 163 164 var f32zero float32 165 var f64zero float64 166 { 167 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator) 168 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator) 169 assertMinMaxZeroesSign(fstats, []float32{-f32zero, f32zero}) 170 assertMinMaxZeroesSign(dstats, []float64{-f64zero, f64zero}) 171 } 172 { 173 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator) 174 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator) 175 assertMinMaxZeroesSign(fstats, []float32{f32zero, -f32zero}) 176 assertMinMaxZeroesSign(dstats, []float64{f64zero, -f64zero}) 177 } 178 { 179 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator) 180 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator) 181 assertMinMaxZeroesSign(fstats, []float32{-f32zero, -f32zero}) 182 assertMinMaxZeroesSign(dstats, []float64{-f64zero, -f64zero}) 183 } 184 { 185 fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator) 186 dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator) 187 assertMinMaxZeroesSign(fstats, []float32{f32zero, f32zero}) 188 assertMinMaxZeroesSign(dstats, []float64{f64zero, f64zero}) 189 } 190 } 191 192 func TestBooleanStatisticsEncoding(t *testing.T) { 193 n := schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1) 194 descr := schema.NewColumn(n, 0, 0) 195 s := metadata.NewStatistics(descr, nil) 196 bs := s.(*metadata.BooleanStatistics) 197 bs.SetMinMax(false, true) 198 maxEnc := bs.EncodeMax() 199 minEnc := bs.EncodeMin() 200 assert.Equal(t, []byte{1}, maxEnc) 201 assert.Equal(t, []byte{0}, minEnc) 202 }