github.com/apache/arrow/go/v14@v14.0.1/parquet/metadata/statistics_test.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata_test
    18  
    19  import (
    20  	"math"
    21  	"reflect"
    22  	"testing"
    23  
    24  	"github.com/apache/arrow/go/v14/arrow/bitutil"
    25  	"github.com/apache/arrow/go/v14/arrow/memory"
    26  	"github.com/apache/arrow/go/v14/parquet"
    27  	"github.com/apache/arrow/go/v14/parquet/metadata"
    28  	"github.com/apache/arrow/go/v14/parquet/schema"
    29  	"github.com/stretchr/testify/assert"
    30  )
    31  
    32  // NOTE(zeroshade): tests will be added and updated after merging the "file" package
    33  // since the tests that I wrote relied on the file writer/reader for ease of use.
    34  
    35  func TestCheckNaNs(t *testing.T) {
    36  	const (
    37  		numvals = 8
    38  		min     = -4.0
    39  		max     = 3.0
    40  	)
    41  	nan := math.NaN()
    42  
    43  	allNans := []float64{nan, nan, nan, nan, nan, nan, nan, nan}
    44  	allNansf32 := make([]float32, numvals)
    45  	for idx, v := range allNans {
    46  		allNansf32[idx] = float32(v)
    47  	}
    48  
    49  	someNans := []float64{nan, max, -3.0, -1.0, nan, 2.0, min, nan}
    50  	someNansf32 := make([]float32, numvals)
    51  	for idx, v := range someNans {
    52  		someNansf32[idx] = float32(v)
    53  	}
    54  
    55  	validBitmap := []byte{0x7F}       // 0b01111111
    56  	validBitmapNoNaNs := []byte{0x6E} // 0b01101110
    57  
    58  	assertUnsetMinMax := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte) {
    59  		if bitmap == nil {
    60  			switch s := stats.(type) {
    61  			case *metadata.Float32Statistics:
    62  				s.Update(values.([]float32), 0)
    63  			case *metadata.Float64Statistics:
    64  				s.Update(values.([]float64), 0)
    65  			}
    66  			assert.False(t, stats.HasMinMax())
    67  		} else {
    68  			nvalues := reflect.ValueOf(values).Len()
    69  			nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
    70  			switch s := stats.(type) {
    71  			case *metadata.Float32Statistics:
    72  				s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
    73  			case *metadata.Float64Statistics:
    74  				s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
    75  			}
    76  			assert.False(t, stats.HasMinMax())
    77  		}
    78  	}
    79  
    80  	assertMinMaxAre := func(stats metadata.TypedStatistics, values interface{}, expectedMin, expectedMax interface{}) {
    81  		switch s := stats.(type) {
    82  		case *metadata.Float32Statistics:
    83  			s.Update(values.([]float32), 0)
    84  			assert.True(t, stats.HasMinMax())
    85  			assert.Equal(t, expectedMin, s.Min())
    86  			assert.Equal(t, expectedMax, s.Max())
    87  		case *metadata.Float64Statistics:
    88  			s.Update(values.([]float64), 0)
    89  			assert.True(t, stats.HasMinMax())
    90  			assert.Equal(t, expectedMin, s.Min())
    91  			assert.Equal(t, expectedMax, s.Max())
    92  		}
    93  	}
    94  
    95  	assertMinMaxAreSpaced := func(stats metadata.TypedStatistics, values interface{}, bitmap []byte, expectedMin, expectedMax interface{}) {
    96  		nvalues := reflect.ValueOf(values).Len()
    97  		nullCount := bitutil.CountSetBits(bitmap, 0, nvalues)
    98  		switch s := stats.(type) {
    99  		case *metadata.Float32Statistics:
   100  			s.UpdateSpaced(values.([]float32), bitmap, 0, int64(nullCount))
   101  			assert.True(t, s.HasMinMax())
   102  			assert.Equal(t, expectedMin, s.Min())
   103  			assert.Equal(t, expectedMax, s.Max())
   104  		case *metadata.Float64Statistics:
   105  			s.UpdateSpaced(values.([]float64), bitmap, 0, int64(nullCount))
   106  			assert.True(t, s.HasMinMax())
   107  			assert.Equal(t, expectedMin, s.Min())
   108  			assert.Equal(t, expectedMax, s.Max())
   109  		}
   110  	}
   111  
   112  	f32Col := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
   113  	f64Col := schema.NewColumn(schema.NewFloat64Node("f", parquet.Repetitions.Optional, -1), 1, 1)
   114  	// test values
   115  	someNanStats := metadata.NewStatistics(f64Col, memory.DefaultAllocator)
   116  	someNanStatsf32 := metadata.NewStatistics(f32Col, memory.DefaultAllocator)
   117  	// ingesting only nans should not yield a min or max
   118  	assertUnsetMinMax(someNanStats, allNans, nil)
   119  	assertUnsetMinMax(someNanStatsf32, allNansf32, nil)
   120  	// ingesting a mix should yield a valid min/max
   121  	assertMinMaxAre(someNanStats, someNans, min, max)
   122  	assertMinMaxAre(someNanStatsf32, someNansf32, float32(min), float32(max))
   123  	// ingesting only nans after a valid min/max should have no effect
   124  	assertMinMaxAre(someNanStats, allNans, min, max)
   125  	assertMinMaxAre(someNanStatsf32, allNansf32, float32(min), float32(max))
   126  
   127  	someNanStats = metadata.NewStatistics(f64Col, memory.DefaultAllocator)
   128  	someNanStatsf32 = metadata.NewStatistics(f32Col, memory.DefaultAllocator)
   129  	assertUnsetMinMax(someNanStats, allNans, validBitmap)
   130  	assertUnsetMinMax(someNanStatsf32, allNansf32, validBitmap)
   131  	// nans should not pollute min/max when excluded via null bitmap
   132  	assertMinMaxAreSpaced(someNanStats, someNans, validBitmapNoNaNs, min, max)
   133  	assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmapNoNaNs, float32(min), float32(max))
   134  	// ingesting nans with a null bitmap should not change the result
   135  	assertMinMaxAreSpaced(someNanStats, someNans, validBitmap, min, max)
   136  	assertMinMaxAreSpaced(someNanStatsf32, someNansf32, validBitmap, float32(min), float32(max))
   137  }
   138  
   139  func TestCheckNegativeZeroStats(t *testing.T) {
   140  	assertMinMaxZeroesSign := func(stats metadata.TypedStatistics, values interface{}) {
   141  		switch s := stats.(type) {
   142  		case *metadata.Float32Statistics:
   143  			s.Update(values.([]float32), 0)
   144  			assert.True(t, s.HasMinMax())
   145  			var zero float32
   146  			assert.Equal(t, zero, s.Min())
   147  			assert.True(t, math.Signbit(float64(s.Min())))
   148  			assert.Equal(t, zero, s.Max())
   149  			assert.False(t, math.Signbit(float64(s.Max())))
   150  		case *metadata.Float64Statistics:
   151  			s.Update(values.([]float64), 0)
   152  			assert.True(t, s.HasMinMax())
   153  			var zero float64
   154  			assert.Equal(t, zero, s.Min())
   155  			assert.True(t, math.Signbit(s.Min()))
   156  			assert.Equal(t, zero, s.Max())
   157  			assert.False(t, math.Signbit(s.Max()))
   158  		}
   159  	}
   160  
   161  	fcol := schema.NewColumn(schema.NewFloat32Node("f", parquet.Repetitions.Optional, -1), 1, 1)
   162  	dcol := schema.NewColumn(schema.NewFloat64Node("d", parquet.Repetitions.Optional, -1), 1, 1)
   163  
   164  	var f32zero float32
   165  	var f64zero float64
   166  	{
   167  		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
   168  		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
   169  		assertMinMaxZeroesSign(fstats, []float32{-f32zero, f32zero})
   170  		assertMinMaxZeroesSign(dstats, []float64{-f64zero, f64zero})
   171  	}
   172  	{
   173  		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
   174  		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
   175  		assertMinMaxZeroesSign(fstats, []float32{f32zero, -f32zero})
   176  		assertMinMaxZeroesSign(dstats, []float64{f64zero, -f64zero})
   177  	}
   178  	{
   179  		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
   180  		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
   181  		assertMinMaxZeroesSign(fstats, []float32{-f32zero, -f32zero})
   182  		assertMinMaxZeroesSign(dstats, []float64{-f64zero, -f64zero})
   183  	}
   184  	{
   185  		fstats := metadata.NewStatistics(fcol, memory.DefaultAllocator)
   186  		dstats := metadata.NewStatistics(dcol, memory.DefaultAllocator)
   187  		assertMinMaxZeroesSign(fstats, []float32{f32zero, f32zero})
   188  		assertMinMaxZeroesSign(dstats, []float64{f64zero, f64zero})
   189  	}
   190  }
   191  
   192  func TestBooleanStatisticsEncoding(t *testing.T) {
   193  	n := schema.NewBooleanNode("boolean", parquet.Repetitions.Required, -1)
   194  	descr := schema.NewColumn(n, 0, 0)
   195  	s := metadata.NewStatistics(descr, nil)
   196  	bs := s.(*metadata.BooleanStatistics)
   197  	bs.SetMinMax(false, true)
   198  	maxEnc := bs.EncodeMax()
   199  	minEnc := bs.EncodeMin()
   200  	assert.Equal(t, []byte{1}, maxEnc)
   201  	assert.Equal(t, []byte{0}, minEnc)
   202  }