github.com/apache/arrow/go/v7@v7.0.1/parquet/metadata/statistics.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"math"
    23  	"unsafe"
    24  
    25  	"github.com/apache/arrow/go/v7/arrow"
    26  	"github.com/apache/arrow/go/v7/arrow/memory"
    27  	"github.com/apache/arrow/go/v7/parquet"
    28  	"github.com/apache/arrow/go/v7/parquet/internal/debug"
    29  	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
    30  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    31  	format "github.com/apache/arrow/go/v7/parquet/internal/gen-go/parquet"
    32  	"github.com/apache/arrow/go/v7/parquet/schema"
    33  )
    34  
    35  //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata statistics_types.gen.go.tmpl
    36  
    37  type StatProvider interface {
    38  	GetMin() []byte
    39  	GetMax() []byte
    40  	GetNullCount() int64
    41  	GetDistinctCount() int64
    42  	IsSetMax() bool
    43  	IsSetMin() bool
    44  	IsSetNullCount() bool
    45  	IsSetDistinctCount() bool
    46  }
    47  
    48  // EncodedStatistics are raw statistics with encoded values that will be written
    49  // to the parquet file, or was read from the parquet file.
    50  type EncodedStatistics struct {
    51  	HasMax           bool
    52  	Max              []byte
    53  	HasMin           bool
    54  	Min              []byte
    55  	Signed           bool
    56  	HasNullCount     bool
    57  	NullCount        int64
    58  	HasDistinctCount bool
    59  	DistinctCount    int64
    60  }
    61  
    62  // ApplyStatSizeLimits sets the maximum size of the min/max values.
    63  //
    64  // from parquet-mr
    65  // we don't write stats larger than the max size rather than truncating.
    66  // the rationale is that some engines may use the minimum value in the page
    67  // as the true minimum for aggregations and there is no way to mark that
    68  // a value has been truncated and is a lower bound and not in the page
    69  func (e *EncodedStatistics) ApplyStatSizeLimits(length int) {
    70  	if len(e.Max) > length {
    71  		e.HasMax = false
    72  	}
    73  	if len(e.Min) > length {
    74  		e.HasMin = false
    75  	}
    76  }
    77  
    78  // IsSet returns true iff one of the Has* values is true.
    79  func (e *EncodedStatistics) IsSet() bool {
    80  	return e.HasMin || e.HasMax || e.HasNullCount || e.HasDistinctCount
    81  }
    82  
    83  // SetMax sets the encoded Max value to val and sets HasMax to true
    84  func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics {
    85  	e.Max = val[:]
    86  	e.HasMax = true
    87  	return e
    88  }
    89  
    90  // SetMin sets the encoded Min value to val, and sets HasMin to true
    91  func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics {
    92  	e.Min = val[:]
    93  	e.HasMin = true
    94  	return e
    95  }
    96  
    97  // SetNullCount sets the NullCount to val and sets HasNullCount to true
    98  func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics {
    99  	e.NullCount = val
   100  	e.HasNullCount = true
   101  	return e
   102  }
   103  
   104  // SetDistinctCount sets the DistinctCount to val and sets HasDistinctCount to true
   105  func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics {
   106  	e.DistinctCount = val
   107  	e.HasDistinctCount = true
   108  	return e
   109  }
   110  
   111  func (e *EncodedStatistics) ToThrift() (stats *format.Statistics) {
   112  	stats = format.NewStatistics()
   113  	if e.HasMin {
   114  		stats.MinValue = e.Min
   115  		// if sort order is SIGNED then the old min value must be set too for backwards compatibility
   116  		if e.Signed {
   117  			stats.Min = e.Min
   118  		}
   119  	}
   120  	if e.HasMax {
   121  		stats.MaxValue = e.Max
   122  		// if sort order is SIGNED then old max value must be set to
   123  		if e.Signed {
   124  			stats.Max = e.Max
   125  		}
   126  	}
   127  	if e.HasNullCount {
   128  		stats.NullCount = &e.NullCount
   129  	}
   130  	if e.HasDistinctCount {
   131  		stats.DistinctCount = &e.DistinctCount
   132  	}
   133  	return
   134  }
   135  
   136  // TypedStatistics is the base interface for dealing with stats as
   137  // they are being populated
   138  type TypedStatistics interface {
   139  	// Type is the underlying physical type for this stat block
   140  	Type() parquet.Type
   141  	// Returns true if there is a min and max value set for this stat object
   142  	HasMinMax() bool
   143  	// Returns true if a nullcount has been set
   144  	HasNullCount() bool
   145  	// returns true only if a distinct count has been set
   146  	// current implementation does of the writer does not automatically populate
   147  	// the distinct count right now.
   148  	HasDistinctCount() bool
   149  	NullCount() int64
   150  	DistinctCount() int64
   151  	NumValues() int64
   152  	// return the column descriptor that this stat object was initialized with
   153  	Descr() *schema.Column
   154  
   155  	// Encode the current min value and return the bytes. ByteArray does not
   156  	// include the len in the encoded bytes, otherwise this is identical to
   157  	// plain encoding
   158  	EncodeMin() []byte
   159  	// Encode the current max value and return the bytes. ByteArray does not
   160  	// include the len in the encoded bytes, otherwise this is identical to
   161  	// plain encoding
   162  	EncodeMax() []byte
   163  	// Populate an EncodedStatistics object from the current stats
   164  	Encode() (EncodedStatistics, error)
   165  	// Resets all values to 0 to enable reusing this stat object for multiple
   166  	// columns, by calling Encode to get the finished values and then calling
   167  	// reset
   168  	Reset()
   169  	// Merge the min/max/nullcounts and distinct count from the passed stat object
   170  	// into this one.
   171  	Merge(TypedStatistics)
   172  }
   173  
   174  type statistics struct {
   175  	descr            *schema.Column
   176  	hasMinMax        bool
   177  	hasNullCount     bool
   178  	hasDistinctCount bool
   179  	mem              memory.Allocator
   180  	nvalues          int64
   181  	stats            EncodedStatistics
   182  	order            schema.SortOrder
   183  
   184  	encoder encoding.TypedEncoder
   185  }
   186  
   187  func (s *statistics) incNulls(n int64) {
   188  	s.stats.NullCount += n
   189  	s.hasNullCount = true
   190  }
   191  func (s *statistics) incDistinct(n int64) {
   192  	s.stats.DistinctCount += n
   193  	s.hasDistinctCount = true
   194  }
   195  
   196  func (s *statistics) Descr() *schema.Column  { return s.descr }
   197  func (s *statistics) Type() parquet.Type     { return s.descr.PhysicalType() }
   198  func (s *statistics) HasDistinctCount() bool { return s.hasDistinctCount }
   199  func (s *statistics) HasMinMax() bool        { return s.hasMinMax }
   200  func (s *statistics) HasNullCount() bool     { return s.hasNullCount }
   201  func (s *statistics) NullCount() int64       { return s.stats.NullCount }
   202  func (s *statistics) DistinctCount() int64   { return s.stats.DistinctCount }
   203  func (s *statistics) NumValues() int64       { return s.nvalues }
   204  
   205  func (s *statistics) Reset() {
   206  	s.stats.NullCount = 0
   207  	s.stats.DistinctCount = 0
   208  	s.nvalues = 0
   209  	s.hasMinMax = false
   210  	s.hasDistinctCount = false
   211  	s.hasNullCount = false
   212  }
   213  
   214  // base merge function for base non-typed stat object so we don't have to
   215  // duplicate this in each of the typed implementations
   216  func (s *statistics) merge(other TypedStatistics) {
   217  	s.nvalues += other.NumValues()
   218  	if other.HasNullCount() {
   219  		s.stats.NullCount += other.NullCount()
   220  	}
   221  	if other.HasDistinctCount() {
   222  		// this isn't technically correct as it should be keeping an actual set
   223  		// of the distinct values and then combining the sets to get a new count
   224  		// but for now we'll do this to match the C++ implementation at the current
   225  		// time.
   226  		s.stats.DistinctCount += other.DistinctCount()
   227  	}
   228  }
   229  
   230  func coalesce(val, fallback interface{}) interface{} {
   231  	switch v := val.(type) {
   232  	case float32:
   233  		if math.IsNaN(float64(v)) {
   234  			return fallback
   235  		}
   236  	case float64:
   237  		if math.IsNaN(v) {
   238  			return fallback
   239  		}
   240  	}
   241  	return val
   242  }
   243  
   244  func signedByteLess(a, b []byte) bool {
   245  	// signed comparison is used for integers encoded as big-endian twos complement
   246  	// integers (e.g. decimals)
   247  
   248  	// if at least one of the lengths is zero, we can short circuit
   249  	if len(a) == 0 || len(b) == 0 {
   250  		return len(a) == 0 && len(b) > 0
   251  	}
   252  
   253  	sa := *(*[]int8)(unsafe.Pointer(&a))
   254  	sb := *(*[]int8)(unsafe.Pointer(&b))
   255  
   256  	// we can short circuit for different signd numbers or for equal length byte
   257  	// arrays that have different first bytes. The equality requirement is necessary
   258  	// for sign extension cases. 0xFF10 should be equal to 0x10 (due to big endian sign extension)
   259  	if int8(0x80&uint8(sa[0])) != int8(0x80&uint8(sb[0])) || (len(sa) == len(sb) && sa[0] != sb[0]) {
   260  		return sa[0] < sb[0]
   261  	}
   262  
   263  	// when the lengths are unequal and the numbers are of the same sign, we need
   264  	// to do comparison by sign extending the shorter value first, and once we get
   265  	// to equal sized arrays, lexicographical unsigned comparison of everything but
   266  	// the first byte is sufficient.
   267  
   268  	if len(a) != len(b) {
   269  		var lead []byte
   270  		if len(a) > len(b) {
   271  			leadLen := len(a) - len(b)
   272  			lead = a[:leadLen]
   273  			a = a[leadLen:]
   274  		} else {
   275  			debug.Assert(len(a) < len(b), "something weird in byte slice signed comparison")
   276  			leadLen := len(b) - len(a)
   277  			lead = b[:leadLen]
   278  			b = b[leadLen:]
   279  		}
   280  
   281  		// compare extra bytes to the sign extension of the first byte of the other number
   282  		var extension byte
   283  		if sa[0] < 0 {
   284  			extension = 0xFF
   285  		}
   286  
   287  		notequal := false
   288  		for _, c := range lead {
   289  			if c != extension {
   290  				notequal = true
   291  				break
   292  			}
   293  		}
   294  
   295  		if notequal {
   296  			// since sign extension are extrema values for unsigned bytes:
   297  			//
   298  			// Four cases exist:
   299  			//	 negative values:
   300  			//	   b is the longer value
   301  			//       b must be the lesser value: return false
   302  			//     else:
   303  			//       a must be the lesser value: return true
   304  			//
   305  			//   positive values:
   306  			//     b is the longer value
   307  			//       values in b must be greater than a: return true
   308  			//     else:
   309  			//       values in a must be greater than b: return false
   310  			neg := sa[0] < 0
   311  			blonger := len(sa) < len(sb)
   312  			return neg != blonger
   313  		}
   314  	} else {
   315  		a = a[1:]
   316  		b = b[1:]
   317  	}
   318  
   319  	return bytes.Compare(a, b) == -1
   320  }
   321  
   322  func (BooleanStatistics) defaultMin() bool { return true }
   323  func (BooleanStatistics) defaultMax() bool { return false }
   324  func (s *Int32Statistics) defaultMin() int32 {
   325  	if s.order == schema.SortUNSIGNED {
   326  		val := math.MaxUint32
   327  		return int32(val)
   328  	}
   329  	return math.MaxInt32
   330  }
   331  
   332  func (s *Int32Statistics) defaultMax() int32 {
   333  	if s.order == schema.SortUNSIGNED {
   334  		return int32(0)
   335  	}
   336  	return math.MinInt32
   337  }
   338  
   339  func (s *Int64Statistics) defaultMin() int64 {
   340  	if s.order == schema.SortUNSIGNED {
   341  		val := uint64(math.MaxUint64)
   342  		return int64(val)
   343  	}
   344  	return math.MaxInt64
   345  }
   346  
   347  func (s *Int64Statistics) defaultMax() int64 {
   348  	if s.order == schema.SortUNSIGNED {
   349  		return int64(0)
   350  	}
   351  	return math.MinInt64
   352  }
   353  
   354  var (
   355  	defaultMinInt96  parquet.Int96
   356  	defaultMinUInt96 parquet.Int96
   357  	defaultMaxInt96  parquet.Int96
   358  	defaultMaxUInt96 parquet.Int96
   359  )
   360  
   361  func init() {
   362  	i96 := arrow.Uint32Traits.CastFromBytes(defaultMinInt96[:])
   363  	i96[0] = math.MaxUint32
   364  	i96[1] = math.MaxUint32
   365  	i96[2] = math.MaxInt32
   366  
   367  	i96 = arrow.Uint32Traits.CastFromBytes(defaultMinUInt96[:])
   368  	i96[0] = math.MaxUint32
   369  	i96[1] = math.MaxUint32
   370  	i96[2] = math.MaxUint32
   371  
   372  	// golang will initialize the bytes to 0
   373  	i96 = arrow.Uint32Traits.CastFromBytes(defaultMaxInt96[:])
   374  	i96[2] = math.MaxInt32 + 1
   375  
   376  	// defaultMaxUInt96 will be initialized to 0 as desired
   377  }
   378  
   379  func (s *Int96Statistics) defaultMin() parquet.Int96 {
   380  	if s.order == schema.SortUNSIGNED {
   381  		return defaultMinUInt96
   382  	}
   383  	return defaultMinInt96
   384  }
   385  
   386  func (s *Int96Statistics) defaultMax() parquet.Int96 {
   387  	if s.order == schema.SortUNSIGNED {
   388  		return defaultMaxUInt96
   389  	}
   390  	return defaultMaxInt96
   391  }
   392  
   393  func (Float32Statistics) defaultMin() float32                             { return math.MaxFloat32 }
   394  func (Float32Statistics) defaultMax() float32                             { return -math.MaxFloat32 }
   395  func (Float64Statistics) defaultMin() float64                             { return math.MaxFloat64 }
   396  func (Float64Statistics) defaultMax() float64                             { return -math.MaxFloat64 }
   397  func (ByteArrayStatistics) defaultMin() parquet.ByteArray                 { return nil }
   398  func (ByteArrayStatistics) defaultMax() parquet.ByteArray                 { return nil }
   399  func (FixedLenByteArrayStatistics) defaultMin() parquet.FixedLenByteArray { return nil }
   400  func (FixedLenByteArrayStatistics) defaultMax() parquet.FixedLenByteArray { return nil }
   401  
   402  func (BooleanStatistics) equal(a, b bool) bool                { return a == b }
   403  func (Int32Statistics) equal(a, b int32) bool                 { return a == b }
   404  func (Int64Statistics) equal(a, b int64) bool                 { return a == b }
   405  func (Float32Statistics) equal(a, b float32) bool             { return a == b }
   406  func (Float64Statistics) equal(a, b float64) bool             { return a == b }
   407  func (Int96Statistics) equal(a, b parquet.Int96) bool         { return bytes.Equal(a[:], b[:]) }
   408  func (ByteArrayStatistics) equal(a, b parquet.ByteArray) bool { return bytes.Equal(a, b) }
   409  func (FixedLenByteArrayStatistics) equal(a, b parquet.FixedLenByteArray) bool {
   410  	return bytes.Equal(a, b)
   411  }
   412  
   413  func (BooleanStatistics) less(a, b bool) bool {
   414  	return !a && b
   415  }
   416  
   417  func (s *Int32Statistics) less(a, b int32) bool {
   418  	if s.order == schema.SortUNSIGNED {
   419  		return uint32(a) < uint32(b)
   420  	}
   421  	return a < b
   422  }
   423  
   424  func (s *Int64Statistics) less(a, b int64) bool {
   425  	if s.order == schema.SortUNSIGNED {
   426  		return uint64(a) < uint64(b)
   427  	}
   428  	return a < b
   429  }
   430  func (Float32Statistics) less(a, b float32) bool { return a < b }
   431  func (Float64Statistics) less(a, b float64) bool { return a < b }
   432  func (s *Int96Statistics) less(a, b parquet.Int96) bool {
   433  	i96a := arrow.Uint32Traits.CastFromBytes(a[:])
   434  	i96b := arrow.Uint32Traits.CastFromBytes(b[:])
   435  	
   436  	a0, a1, a2 := utils.ToLEUint32(i96a[0]), utils.ToLEUint32(i96a[1]), utils.ToLEUint32(i96a[2])
   437  	b0, b1, b2 := utils.ToLEUint32(i96b[0]), utils.ToLEUint32(i96b[1]), utils.ToLEUint32(i96b[2])
   438  	
   439  	if a2 != b2 {
   440  		// only the msb bit is by signed comparison
   441  		if s.order == schema.SortSIGNED {
   442  			return int32(a2) < int32(b2)
   443  		}
   444  		return a2 < b2
   445  	} else if a1 != b1 {
   446  		return a1 < b1
   447  	}
   448  	return a0 < b0
   449  }
   450  
   451  func (s *ByteArrayStatistics) less(a, b parquet.ByteArray) bool {
   452  	if s.order == schema.SortUNSIGNED {
   453  		return bytes.Compare(a, b) == -1
   454  	}
   455  
   456  	return signedByteLess([]byte(a), []byte(b))
   457  }
   458  
   459  func (s *FixedLenByteArrayStatistics) less(a, b parquet.FixedLenByteArray) bool {
   460  	if s.order == schema.SortUNSIGNED {
   461  		return bytes.Compare(a, b) == -1
   462  	}
   463  
   464  	return signedByteLess([]byte(a), []byte(b))
   465  }
   466  
   467  func (BooleanStatistics) cleanStat(minMax minmaxPairBoolean) *minmaxPairBoolean { return &minMax }
   468  func (Int32Statistics) cleanStat(minMax minmaxPairInt32) *minmaxPairInt32       { return &minMax }
   469  func (Int64Statistics) cleanStat(minMax minmaxPairInt64) *minmaxPairInt64       { return &minMax }
   470  func (Int96Statistics) cleanStat(minMax minmaxPairInt96) *minmaxPairInt96       { return &minMax }
   471  
   472  // in the case of floating point types, the following rules are applied as per parquet-mr:
   473  // - if any of min/max is NaN, return nothing
   474  // - if min is 0.0f replace with -0.0f
   475  // - if max is -0.0f replace with 0.0f
   476  //
   477  // https://issues.apache.org/jira/browse/PARQUET-1222 tracks the official documenting of
   478  // a well-defined order for floats and doubles.
   479  func (Float32Statistics) cleanStat(minMax minmaxPairFloat32) *minmaxPairFloat32 {
   480  	if math.IsNaN(float64(minMax[0])) || math.IsNaN(float64(minMax[1])) {
   481  		return nil
   482  	}
   483  
   484  	if minMax[0] == math.MaxFloat32 && minMax[1] == -math.MaxFloat32 {
   485  		return nil
   486  	}
   487  
   488  	var zero float32 = 0
   489  	if minMax[0] == zero && !math.Signbit(float64(minMax[0])) {
   490  		minMax[0] = -minMax[0]
   491  	}
   492  
   493  	if minMax[1] == zero && math.Signbit(float64(minMax[1])) {
   494  		minMax[1] = -minMax[1]
   495  	}
   496  
   497  	return &minMax
   498  }
   499  
   500  func (Float64Statistics) cleanStat(minMax minmaxPairFloat64) *minmaxPairFloat64 {
   501  	if math.IsNaN(minMax[0]) || math.IsNaN(minMax[1]) {
   502  		return nil
   503  	}
   504  
   505  	if minMax[0] == math.MaxFloat64 && minMax[1] == -math.MaxFloat64 {
   506  		return nil
   507  	}
   508  
   509  	var zero float64 = 0
   510  	if minMax[0] == zero && !math.Signbit(minMax[0]) {
   511  		minMax[0] = -minMax[0]
   512  	}
   513  
   514  	if minMax[1] == zero && math.Signbit(minMax[1]) {
   515  		minMax[1] = -minMax[1]
   516  	}
   517  
   518  	return &minMax
   519  }
   520  
   521  func (ByteArrayStatistics) cleanStat(minMax minmaxPairByteArray) *minmaxPairByteArray {
   522  	if minMax[0] == nil || minMax[1] == nil {
   523  		return nil
   524  	}
   525  	return &minMax
   526  }
   527  
   528  func (FixedLenByteArrayStatistics) cleanStat(minMax minmaxPairFixedLenByteArray) *minmaxPairFixedLenByteArray {
   529  	if minMax[0] == nil || minMax[1] == nil {
   530  		return nil
   531  	}
   532  	return &minMax
   533  }
   534  
   535  func GetStatValue(typ parquet.Type, val []byte) interface{} {
   536  	switch typ {
   537  	case parquet.Types.Boolean:
   538  		return val[0] != 0
   539  	case parquet.Types.Int32:
   540  		return int32(binary.LittleEndian.Uint32(val))
   541  	case parquet.Types.Int64:
   542  		return int64(binary.LittleEndian.Uint64(val))
   543  	case parquet.Types.Int96:
   544  		p := parquet.Int96{}
   545  		copy(p[:], val)
   546  		return p
   547  	case parquet.Types.Float:
   548  		return math.Float32frombits(binary.LittleEndian.Uint32(val))
   549  	case parquet.Types.Double:
   550  		return math.Float64frombits(binary.LittleEndian.Uint64(val))
   551  	case parquet.Types.ByteArray:
   552  		fallthrough
   553  	case parquet.Types.FixedLenByteArray:
   554  		return val
   555  	}
   556  	return nil
   557  }