github.com/apache/arrow/go/v14@v14.0.1/parquet/metadata/statistics.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"math"
    23  	"unsafe"
    24  
    25  	"github.com/apache/arrow/go/v14/arrow"
    26  	"github.com/apache/arrow/go/v14/arrow/memory"
    27  	"github.com/apache/arrow/go/v14/internal/utils"
    28  	"github.com/apache/arrow/go/v14/parquet"
    29  	"github.com/apache/arrow/go/v14/parquet/internal/debug"
    30  	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
    31  	format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet"
    32  	"github.com/apache/arrow/go/v14/parquet/schema"
    33  )
    34  
    35  //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata statistics_types.gen.go.tmpl
    36  
    37  type StatProvider interface {
    38  	GetMin() []byte
    39  	GetMax() []byte
    40  	GetNullCount() int64
    41  	GetDistinctCount() int64
    42  	IsSetMax() bool
    43  	IsSetMin() bool
    44  	IsSetNullCount() bool
    45  	IsSetDistinctCount() bool
    46  }
    47  
    48  // EncodedStatistics are raw statistics with encoded values that will be written
    49  // to the parquet file, or was read from the parquet file.
    50  type EncodedStatistics struct {
    51  	HasMax           bool
    52  	Max              []byte
    53  	HasMin           bool
    54  	Min              []byte
    55  	Signed           bool
    56  	HasNullCount     bool
    57  	NullCount        int64
    58  	HasDistinctCount bool
    59  	DistinctCount    int64
    60  }
    61  
    62  // ApplyStatSizeLimits sets the maximum size of the min/max values.
    63  //
    64  // from parquet-mr
    65  // we don't write stats larger than the max size rather than truncating.
    66  // the rationale is that some engines may use the minimum value in the page
    67  // as the true minimum for aggregations and there is no way to mark that
    68  // a value has been truncated and is a lower bound and not in the page
    69  func (e *EncodedStatistics) ApplyStatSizeLimits(length int) {
    70  	if len(e.Max) > length {
    71  		e.HasMax = false
    72  	}
    73  	if len(e.Min) > length {
    74  		e.HasMin = false
    75  	}
    76  }
    77  
    78  // IsSet returns true iff one of the Has* values is true.
    79  func (e *EncodedStatistics) IsSet() bool {
    80  	return e.HasMin || e.HasMax || e.HasNullCount || e.HasDistinctCount
    81  }
    82  
    83  // SetMax sets the encoded Max value to val and sets HasMax to true
    84  func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics {
    85  	e.Max = val[:]
    86  	e.HasMax = true
    87  	return e
    88  }
    89  
    90  // SetMin sets the encoded Min value to val, and sets HasMin to true
    91  func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics {
    92  	e.Min = val[:]
    93  	e.HasMin = true
    94  	return e
    95  }
    96  
    97  // SetNullCount sets the NullCount to val and sets HasNullCount to true
    98  func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics {
    99  	e.NullCount = val
   100  	e.HasNullCount = true
   101  	return e
   102  }
   103  
   104  // SetDistinctCount sets the DistinctCount to val and sets HasDistinctCount to true
   105  func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics {
   106  	e.DistinctCount = val
   107  	e.HasDistinctCount = true
   108  	return e
   109  }
   110  
   111  func (e *EncodedStatistics) ToThrift() (stats *format.Statistics) {
   112  	stats = format.NewStatistics()
   113  	if e.HasMin {
   114  		stats.MinValue = e.Min
   115  		// if sort order is SIGNED then the old min value must be set too for backwards compatibility
   116  		if e.Signed {
   117  			stats.Min = e.Min
   118  		}
   119  	}
   120  	if e.HasMax {
   121  		stats.MaxValue = e.Max
   122  		// if sort order is SIGNED then old max value must be set to
   123  		if e.Signed {
   124  			stats.Max = e.Max
   125  		}
   126  	}
   127  	if e.HasNullCount {
   128  		stats.NullCount = &e.NullCount
   129  	}
   130  	if e.HasDistinctCount {
   131  		stats.DistinctCount = &e.DistinctCount
   132  	}
   133  	return
   134  }
   135  
   136  // TypedStatistics is the base interface for dealing with stats as
   137  // they are being populated
   138  type TypedStatistics interface {
   139  	// Type is the underlying physical type for this stat block
   140  	Type() parquet.Type
   141  	// Returns true if there is a min and max value set for this stat object
   142  	HasMinMax() bool
   143  	// Returns true if a nullcount has been set
   144  	HasNullCount() bool
   145  	// returns true only if a distinct count has been set
   146  	// current implementation does of the writer does not automatically populate
   147  	// the distinct count right now.
   148  	HasDistinctCount() bool
   149  	NullCount() int64
   150  	DistinctCount() int64
   151  	NumValues() int64
   152  	// return the column descriptor that this stat object was initialized with
   153  	Descr() *schema.Column
   154  
   155  	// Encode the current min value and return the bytes. ByteArray does not
   156  	// include the len in the encoded bytes, otherwise this is identical to
   157  	// plain encoding
   158  	EncodeMin() []byte
   159  	// Encode the current max value and return the bytes. ByteArray does not
   160  	// include the len in the encoded bytes, otherwise this is identical to
   161  	// plain encoding
   162  	EncodeMax() []byte
   163  	// Populate an EncodedStatistics object from the current stats
   164  	Encode() (EncodedStatistics, error)
   165  	// Resets all values to 0 to enable reusing this stat object for multiple
   166  	// columns, by calling Encode to get the finished values and then calling
   167  	// reset
   168  	Reset()
   169  	// Merge the min/max/nullcounts and distinct count from the passed stat object
   170  	// into this one.
   171  	Merge(TypedStatistics)
   172  
   173  	// UpdateFromArrow updates the statistics from an Arrow Array,
   174  	// only updating the null and num value counts if updateCounts
   175  	// is true.
   176  	UpdateFromArrow(values arrow.Array, updateCounts bool) error
   177  	// IncNulls increments the number of nulls in the statistics
   178  	// and marks HasNullCount as true
   179  	IncNulls(int64)
   180  	// IncDistinct increments the number of distinct values in
   181  	// the statistics and marks HasDistinctCount as true
   182  	IncDistinct(int64)
   183  	// IncNumValues increments the total number of values in
   184  	// the statistics
   185  	IncNumValues(int64)
   186  }
   187  
   188  type statistics struct {
   189  	descr            *schema.Column
   190  	hasMinMax        bool
   191  	hasNullCount     bool
   192  	hasDistinctCount bool
   193  	mem              memory.Allocator
   194  	nvalues          int64
   195  	stats            EncodedStatistics
   196  	order            schema.SortOrder
   197  
   198  	encoder encoding.TypedEncoder
   199  }
   200  
   201  func (s *statistics) IncNumValues(n int64) {
   202  	s.nvalues += n
   203  }
   204  func (s *statistics) IncNulls(n int64) {
   205  	s.stats.NullCount += n
   206  	s.hasNullCount = true
   207  }
   208  func (s *statistics) IncDistinct(n int64) {
   209  	s.stats.DistinctCount += n
   210  	s.hasDistinctCount = true
   211  }
   212  
   213  func (s *statistics) Descr() *schema.Column  { return s.descr }
   214  func (s *statistics) Type() parquet.Type     { return s.descr.PhysicalType() }
   215  func (s *statistics) HasDistinctCount() bool { return s.hasDistinctCount }
   216  func (s *statistics) HasMinMax() bool        { return s.hasMinMax }
   217  func (s *statistics) HasNullCount() bool     { return s.hasNullCount }
   218  func (s *statistics) NullCount() int64       { return s.stats.NullCount }
   219  func (s *statistics) DistinctCount() int64   { return s.stats.DistinctCount }
   220  func (s *statistics) NumValues() int64       { return s.nvalues }
   221  
   222  func (s *statistics) Reset() {
   223  	s.stats.NullCount = 0
   224  	s.stats.DistinctCount = 0
   225  	s.nvalues = 0
   226  	s.hasMinMax = false
   227  	s.hasDistinctCount = false
   228  	s.hasNullCount = false
   229  }
   230  
   231  // base merge function for base non-typed stat object so we don't have to
   232  // duplicate this in each of the typed implementations
   233  func (s *statistics) merge(other TypedStatistics) {
   234  	s.nvalues += other.NumValues()
   235  	if other.HasNullCount() {
   236  		s.stats.NullCount += other.NullCount()
   237  	}
   238  	if other.HasDistinctCount() {
   239  		// this isn't technically correct as it should be keeping an actual set
   240  		// of the distinct values and then combining the sets to get a new count
   241  		// but for now we'll do this to match the C++ implementation at the current
   242  		// time.
   243  		s.stats.DistinctCount += other.DistinctCount()
   244  	}
   245  }
   246  
   247  func coalesce(val, fallback interface{}) interface{} {
   248  	switch v := val.(type) {
   249  	case float32:
   250  		if math.IsNaN(float64(v)) {
   251  			return fallback
   252  		}
   253  	case float64:
   254  		if math.IsNaN(v) {
   255  			return fallback
   256  		}
   257  	}
   258  	return val
   259  }
   260  
   261  func signedByteLess(a, b []byte) bool {
   262  	// signed comparison is used for integers encoded as big-endian twos complement
   263  	// integers (e.g. decimals)
   264  
   265  	// if at least one of the lengths is zero, we can short circuit
   266  	if len(a) == 0 || len(b) == 0 {
   267  		return len(a) == 0 && len(b) > 0
   268  	}
   269  
   270  	sa := *(*[]int8)(unsafe.Pointer(&a))
   271  	sb := *(*[]int8)(unsafe.Pointer(&b))
   272  
   273  	// we can short circuit for different signd numbers or for equal length byte
   274  	// arrays that have different first bytes. The equality requirement is necessary
   275  	// for sign extension cases. 0xFF10 should be equal to 0x10 (due to big endian sign extension)
   276  	if int8(0x80&uint8(sa[0])) != int8(0x80&uint8(sb[0])) || (len(sa) == len(sb) && sa[0] != sb[0]) {
   277  		return sa[0] < sb[0]
   278  	}
   279  
   280  	// when the lengths are unequal and the numbers are of the same sign, we need
   281  	// to do comparison by sign extending the shorter value first, and once we get
   282  	// to equal sized arrays, lexicographical unsigned comparison of everything but
   283  	// the first byte is sufficient.
   284  
   285  	if len(a) != len(b) {
   286  		var lead []byte
   287  		if len(a) > len(b) {
   288  			leadLen := len(a) - len(b)
   289  			lead = a[:leadLen]
   290  			a = a[leadLen:]
   291  		} else {
   292  			debug.Assert(len(a) < len(b), "something weird in byte slice signed comparison")
   293  			leadLen := len(b) - len(a)
   294  			lead = b[:leadLen]
   295  			b = b[leadLen:]
   296  		}
   297  
   298  		// compare extra bytes to the sign extension of the first byte of the other number
   299  		var extension byte
   300  		if sa[0] < 0 {
   301  			extension = 0xFF
   302  		}
   303  
   304  		notequal := false
   305  		for _, c := range lead {
   306  			if c != extension {
   307  				notequal = true
   308  				break
   309  			}
   310  		}
   311  
   312  		if notequal {
   313  			// since sign extension are extrema values for unsigned bytes:
   314  			//
   315  			// Four cases exist:
   316  			//	 negative values:
   317  			//	   b is the longer value
   318  			//       b must be the lesser value: return false
   319  			//     else:
   320  			//       a must be the lesser value: return true
   321  			//
   322  			//   positive values:
   323  			//     b is the longer value
   324  			//       values in b must be greater than a: return true
   325  			//     else:
   326  			//       values in a must be greater than b: return false
   327  			neg := sa[0] < 0
   328  			blonger := len(sa) < len(sb)
   329  			return neg != blonger
   330  		}
   331  	} else {
   332  		a = a[1:]
   333  		b = b[1:]
   334  	}
   335  
   336  	return bytes.Compare(a, b) == -1
   337  }
   338  
   339  func (BooleanStatistics) defaultMin() bool { return true }
   340  func (BooleanStatistics) defaultMax() bool { return false }
   341  func (s *Int32Statistics) defaultMin() int32 {
   342  	if s.order == schema.SortUNSIGNED {
   343  		val := uint32(math.MaxUint32)
   344  		return int32(val)
   345  	}
   346  	return math.MaxInt32
   347  }
   348  
   349  func (s *Int32Statistics) defaultMax() int32 {
   350  	if s.order == schema.SortUNSIGNED {
   351  		return int32(0)
   352  	}
   353  	return math.MinInt32
   354  }
   355  
   356  func (s *Int64Statistics) defaultMin() int64 {
   357  	if s.order == schema.SortUNSIGNED {
   358  		val := uint64(math.MaxUint64)
   359  		return int64(val)
   360  	}
   361  	return math.MaxInt64
   362  }
   363  
   364  func (s *Int64Statistics) defaultMax() int64 {
   365  	if s.order == schema.SortUNSIGNED {
   366  		return int64(0)
   367  	}
   368  	return math.MinInt64
   369  }
   370  
   371  var (
   372  	defaultMinInt96  parquet.Int96
   373  	defaultMinUInt96 parquet.Int96
   374  	defaultMaxInt96  parquet.Int96
   375  	defaultMaxUInt96 parquet.Int96
   376  )
   377  
   378  func init() {
   379  	i96 := arrow.Uint32Traits.CastFromBytes(defaultMinInt96[:])
   380  	i96[0] = math.MaxUint32
   381  	i96[1] = math.MaxUint32
   382  	i96[2] = math.MaxInt32
   383  
   384  	i96 = arrow.Uint32Traits.CastFromBytes(defaultMinUInt96[:])
   385  	i96[0] = math.MaxUint32
   386  	i96[1] = math.MaxUint32
   387  	i96[2] = math.MaxUint32
   388  
   389  	// golang will initialize the bytes to 0
   390  	i96 = arrow.Uint32Traits.CastFromBytes(defaultMaxInt96[:])
   391  	i96[2] = math.MaxInt32 + 1
   392  
   393  	// defaultMaxUInt96 will be initialized to 0 as desired
   394  }
   395  
   396  func (s *Int96Statistics) defaultMin() parquet.Int96 {
   397  	if s.order == schema.SortUNSIGNED {
   398  		return defaultMinUInt96
   399  	}
   400  	return defaultMinInt96
   401  }
   402  
   403  func (s *Int96Statistics) defaultMax() parquet.Int96 {
   404  	if s.order == schema.SortUNSIGNED {
   405  		return defaultMaxUInt96
   406  	}
   407  	return defaultMaxInt96
   408  }
   409  
   410  func (Float32Statistics) defaultMin() float32                             { return math.MaxFloat32 }
   411  func (Float32Statistics) defaultMax() float32                             { return -math.MaxFloat32 }
   412  func (Float64Statistics) defaultMin() float64                             { return math.MaxFloat64 }
   413  func (Float64Statistics) defaultMax() float64                             { return -math.MaxFloat64 }
   414  func (ByteArrayStatistics) defaultMin() parquet.ByteArray                 { return nil }
   415  func (ByteArrayStatistics) defaultMax() parquet.ByteArray                 { return nil }
   416  func (FixedLenByteArrayStatistics) defaultMin() parquet.FixedLenByteArray { return nil }
   417  func (FixedLenByteArrayStatistics) defaultMax() parquet.FixedLenByteArray { return nil }
   418  
   419  func (BooleanStatistics) equal(a, b bool) bool                { return a == b }
   420  func (Int32Statistics) equal(a, b int32) bool                 { return a == b }
   421  func (Int64Statistics) equal(a, b int64) bool                 { return a == b }
   422  func (Float32Statistics) equal(a, b float32) bool             { return a == b }
   423  func (Float64Statistics) equal(a, b float64) bool             { return a == b }
   424  func (Int96Statistics) equal(a, b parquet.Int96) bool         { return bytes.Equal(a[:], b[:]) }
   425  func (ByteArrayStatistics) equal(a, b parquet.ByteArray) bool { return bytes.Equal(a, b) }
   426  func (FixedLenByteArrayStatistics) equal(a, b parquet.FixedLenByteArray) bool {
   427  	return bytes.Equal(a, b)
   428  }
   429  
   430  func (BooleanStatistics) less(a, b bool) bool {
   431  	return !a && b
   432  }
   433  
   434  func (s *Int32Statistics) less(a, b int32) bool {
   435  	if s.order == schema.SortUNSIGNED {
   436  		return uint32(a) < uint32(b)
   437  	}
   438  	return a < b
   439  }
   440  
   441  func (s *Int64Statistics) less(a, b int64) bool {
   442  	if s.order == schema.SortUNSIGNED {
   443  		return uint64(a) < uint64(b)
   444  	}
   445  	return a < b
   446  }
   447  func (Float32Statistics) less(a, b float32) bool { return a < b }
   448  func (Float64Statistics) less(a, b float64) bool { return a < b }
   449  func (s *Int96Statistics) less(a, b parquet.Int96) bool {
   450  	i96a := arrow.Uint32Traits.CastFromBytes(a[:])
   451  	i96b := arrow.Uint32Traits.CastFromBytes(b[:])
   452  
   453  	a0, a1, a2 := utils.ToLEUint32(i96a[0]), utils.ToLEUint32(i96a[1]), utils.ToLEUint32(i96a[2])
   454  	b0, b1, b2 := utils.ToLEUint32(i96b[0]), utils.ToLEUint32(i96b[1]), utils.ToLEUint32(i96b[2])
   455  
   456  	if a2 != b2 {
   457  		// only the msb bit is by signed comparison
   458  		if s.order == schema.SortSIGNED {
   459  			return int32(a2) < int32(b2)
   460  		}
   461  		return a2 < b2
   462  	} else if a1 != b1 {
   463  		return a1 < b1
   464  	}
   465  	return a0 < b0
   466  }
   467  
   468  func (s *ByteArrayStatistics) less(a, b parquet.ByteArray) bool {
   469  	if s.order == schema.SortUNSIGNED {
   470  		return bytes.Compare(a, b) == -1
   471  	}
   472  
   473  	return signedByteLess([]byte(a), []byte(b))
   474  }
   475  
   476  func (s *FixedLenByteArrayStatistics) less(a, b parquet.FixedLenByteArray) bool {
   477  	if s.order == schema.SortUNSIGNED {
   478  		return bytes.Compare(a, b) == -1
   479  	}
   480  
   481  	return signedByteLess([]byte(a), []byte(b))
   482  }
   483  
   484  func (BooleanStatistics) cleanStat(minMax minmaxPairBoolean) *minmaxPairBoolean { return &minMax }
   485  func (Int32Statistics) cleanStat(minMax minmaxPairInt32) *minmaxPairInt32       { return &minMax }
   486  func (Int64Statistics) cleanStat(minMax minmaxPairInt64) *minmaxPairInt64       { return &minMax }
   487  func (Int96Statistics) cleanStat(minMax minmaxPairInt96) *minmaxPairInt96       { return &minMax }
   488  
   489  // in the case of floating point types, the following rules are applied as per parquet-mr:
   490  // - if any of min/max is NaN, return nothing
   491  // - if min is 0.0f replace with -0.0f
   492  // - if max is -0.0f replace with 0.0f
   493  //
   494  // https://issues.apache.org/jira/browse/PARQUET-1222 tracks the official documenting of
   495  // a well-defined order for floats and doubles.
   496  func (Float32Statistics) cleanStat(minMax minmaxPairFloat32) *minmaxPairFloat32 {
   497  	if math.IsNaN(float64(minMax[0])) || math.IsNaN(float64(minMax[1])) {
   498  		return nil
   499  	}
   500  
   501  	if minMax[0] == math.MaxFloat32 && minMax[1] == -math.MaxFloat32 {
   502  		return nil
   503  	}
   504  
   505  	var zero float32 = 0
   506  	if minMax[0] == zero && !math.Signbit(float64(minMax[0])) {
   507  		minMax[0] = -minMax[0]
   508  	}
   509  
   510  	if minMax[1] == zero && math.Signbit(float64(minMax[1])) {
   511  		minMax[1] = -minMax[1]
   512  	}
   513  
   514  	return &minMax
   515  }
   516  
   517  func (Float64Statistics) cleanStat(minMax minmaxPairFloat64) *minmaxPairFloat64 {
   518  	if math.IsNaN(minMax[0]) || math.IsNaN(minMax[1]) {
   519  		return nil
   520  	}
   521  
   522  	if minMax[0] == math.MaxFloat64 && minMax[1] == -math.MaxFloat64 {
   523  		return nil
   524  	}
   525  
   526  	var zero float64 = 0
   527  	if minMax[0] == zero && !math.Signbit(minMax[0]) {
   528  		minMax[0] = -minMax[0]
   529  	}
   530  
   531  	if minMax[1] == zero && math.Signbit(minMax[1]) {
   532  		minMax[1] = -minMax[1]
   533  	}
   534  
   535  	return &minMax
   536  }
   537  
   538  func (ByteArrayStatistics) cleanStat(minMax minmaxPairByteArray) *minmaxPairByteArray {
   539  	if minMax[0] == nil || minMax[1] == nil {
   540  		return nil
   541  	}
   542  	return &minMax
   543  }
   544  
   545  func (FixedLenByteArrayStatistics) cleanStat(minMax minmaxPairFixedLenByteArray) *minmaxPairFixedLenByteArray {
   546  	if minMax[0] == nil || minMax[1] == nil {
   547  		return nil
   548  	}
   549  	return &minMax
   550  }
   551  
   552  func GetStatValue(typ parquet.Type, val []byte) interface{} {
   553  	switch typ {
   554  	case parquet.Types.Boolean:
   555  		return val[0] != 0
   556  	case parquet.Types.Int32:
   557  		return int32(binary.LittleEndian.Uint32(val))
   558  	case parquet.Types.Int64:
   559  		return int64(binary.LittleEndian.Uint64(val))
   560  	case parquet.Types.Int96:
   561  		p := parquet.Int96{}
   562  		copy(p[:], val)
   563  		return p
   564  	case parquet.Types.Float:
   565  		return math.Float32frombits(binary.LittleEndian.Uint32(val))
   566  	case parquet.Types.Double:
   567  		return math.Float64frombits(binary.LittleEndian.Uint64(val))
   568  	case parquet.Types.ByteArray:
   569  		fallthrough
   570  	case parquet.Types.FixedLenByteArray:
   571  		return val
   572  	}
   573  	return nil
   574  }