github.com/apache/arrow/go/v14@v14.0.2/parquet/metadata/statistics.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package metadata 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "math" 23 "unsafe" 24 25 "github.com/apache/arrow/go/v14/arrow" 26 "github.com/apache/arrow/go/v14/arrow/memory" 27 "github.com/apache/arrow/go/v14/internal/utils" 28 "github.com/apache/arrow/go/v14/parquet" 29 "github.com/apache/arrow/go/v14/parquet/internal/debug" 30 "github.com/apache/arrow/go/v14/parquet/internal/encoding" 31 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 32 "github.com/apache/arrow/go/v14/parquet/schema" 33 ) 34 35 //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata statistics_types.gen.go.tmpl 36 37 type StatProvider interface { 38 GetMin() []byte 39 GetMax() []byte 40 GetNullCount() int64 41 GetDistinctCount() int64 42 IsSetMax() bool 43 IsSetMin() bool 44 IsSetNullCount() bool 45 IsSetDistinctCount() bool 46 } 47 48 // EncodedStatistics are raw statistics with encoded values that will be written 49 // to the parquet file, or was read from the parquet file. 50 type EncodedStatistics struct { 51 HasMax bool 52 Max []byte 53 HasMin bool 54 Min []byte 55 Signed bool 56 HasNullCount bool 57 NullCount int64 58 HasDistinctCount bool 59 DistinctCount int64 60 } 61 62 // ApplyStatSizeLimits sets the maximum size of the min/max values. 63 // 64 // from parquet-mr 65 // we don't write stats larger than the max size rather than truncating. 66 // the rationale is that some engines may use the minimum value in the page 67 // as the true minimum for aggregations and there is no way to mark that 68 // a value has been truncated and is a lower bound and not in the page 69 func (e *EncodedStatistics) ApplyStatSizeLimits(length int) { 70 if len(e.Max) > length { 71 e.HasMax = false 72 } 73 if len(e.Min) > length { 74 e.HasMin = false 75 } 76 } 77 78 // IsSet returns true iff one of the Has* values is true. 79 func (e *EncodedStatistics) IsSet() bool { 80 return e.HasMin || e.HasMax || e.HasNullCount || e.HasDistinctCount 81 } 82 83 // SetMax sets the encoded Max value to val and sets HasMax to true 84 func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics { 85 e.Max = val[:] 86 e.HasMax = true 87 return e 88 } 89 90 // SetMin sets the encoded Min value to val, and sets HasMin to true 91 func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics { 92 e.Min = val[:] 93 e.HasMin = true 94 return e 95 } 96 97 // SetNullCount sets the NullCount to val and sets HasNullCount to true 98 func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics { 99 e.NullCount = val 100 e.HasNullCount = true 101 return e 102 } 103 104 // SetDistinctCount sets the DistinctCount to val and sets HasDistinctCount to true 105 func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics { 106 e.DistinctCount = val 107 e.HasDistinctCount = true 108 return e 109 } 110 111 func (e *EncodedStatistics) ToThrift() (stats *format.Statistics) { 112 stats = format.NewStatistics() 113 if e.HasMin { 114 stats.MinValue = e.Min 115 // if sort order is SIGNED then the old min value must be set too for backwards compatibility 116 if e.Signed { 117 stats.Min = e.Min 118 } 119 } 120 if e.HasMax { 121 stats.MaxValue = e.Max 122 // if sort order is SIGNED then old max value must be set to 123 if e.Signed { 124 stats.Max = e.Max 125 } 126 } 127 if e.HasNullCount { 128 stats.NullCount = &e.NullCount 129 } 130 if e.HasDistinctCount { 131 stats.DistinctCount = &e.DistinctCount 132 } 133 return 134 } 135 136 // TypedStatistics is the base interface for dealing with stats as 137 // they are being populated 138 type TypedStatistics interface { 139 // Type is the underlying physical type for this stat block 140 Type() parquet.Type 141 // Returns true if there is a min and max value set for this stat object 142 HasMinMax() bool 143 // Returns true if a nullcount has been set 144 HasNullCount() bool 145 // returns true only if a distinct count has been set 146 // current implementation does of the writer does not automatically populate 147 // the distinct count right now. 148 HasDistinctCount() bool 149 NullCount() int64 150 DistinctCount() int64 151 NumValues() int64 152 // return the column descriptor that this stat object was initialized with 153 Descr() *schema.Column 154 155 // Encode the current min value and return the bytes. ByteArray does not 156 // include the len in the encoded bytes, otherwise this is identical to 157 // plain encoding 158 EncodeMin() []byte 159 // Encode the current max value and return the bytes. ByteArray does not 160 // include the len in the encoded bytes, otherwise this is identical to 161 // plain encoding 162 EncodeMax() []byte 163 // Populate an EncodedStatistics object from the current stats 164 Encode() (EncodedStatistics, error) 165 // Resets all values to 0 to enable reusing this stat object for multiple 166 // columns, by calling Encode to get the finished values and then calling 167 // reset 168 Reset() 169 // Merge the min/max/nullcounts and distinct count from the passed stat object 170 // into this one. 171 Merge(TypedStatistics) 172 173 // UpdateFromArrow updates the statistics from an Arrow Array, 174 // only updating the null and num value counts if updateCounts 175 // is true. 176 UpdateFromArrow(values arrow.Array, updateCounts bool) error 177 // IncNulls increments the number of nulls in the statistics 178 // and marks HasNullCount as true 179 IncNulls(int64) 180 // IncDistinct increments the number of distinct values in 181 // the statistics and marks HasDistinctCount as true 182 IncDistinct(int64) 183 // IncNumValues increments the total number of values in 184 // the statistics 185 IncNumValues(int64) 186 } 187 188 type statistics struct { 189 descr *schema.Column 190 hasMinMax bool 191 hasNullCount bool 192 hasDistinctCount bool 193 mem memory.Allocator 194 nvalues int64 195 stats EncodedStatistics 196 order schema.SortOrder 197 198 encoder encoding.TypedEncoder 199 } 200 201 func (s *statistics) IncNumValues(n int64) { 202 s.nvalues += n 203 } 204 func (s *statistics) IncNulls(n int64) { 205 s.stats.NullCount += n 206 s.hasNullCount = true 207 } 208 func (s *statistics) IncDistinct(n int64) { 209 s.stats.DistinctCount += n 210 s.hasDistinctCount = true 211 } 212 213 func (s *statistics) Descr() *schema.Column { return s.descr } 214 func (s *statistics) Type() parquet.Type { return s.descr.PhysicalType() } 215 func (s *statistics) HasDistinctCount() bool { return s.hasDistinctCount } 216 func (s *statistics) HasMinMax() bool { return s.hasMinMax } 217 func (s *statistics) HasNullCount() bool { return s.hasNullCount } 218 func (s *statistics) NullCount() int64 { return s.stats.NullCount } 219 func (s *statistics) DistinctCount() int64 { return s.stats.DistinctCount } 220 func (s *statistics) NumValues() int64 { return s.nvalues } 221 222 func (s *statistics) Reset() { 223 s.stats.NullCount = 0 224 s.stats.DistinctCount = 0 225 s.nvalues = 0 226 s.hasMinMax = false 227 s.hasDistinctCount = false 228 s.hasNullCount = false 229 } 230 231 // base merge function for base non-typed stat object so we don't have to 232 // duplicate this in each of the typed implementations 233 func (s *statistics) merge(other TypedStatistics) { 234 s.nvalues += other.NumValues() 235 if other.HasNullCount() { 236 s.stats.NullCount += other.NullCount() 237 } 238 if other.HasDistinctCount() { 239 // this isn't technically correct as it should be keeping an actual set 240 // of the distinct values and then combining the sets to get a new count 241 // but for now we'll do this to match the C++ implementation at the current 242 // time. 243 s.stats.DistinctCount += other.DistinctCount() 244 } 245 } 246 247 func coalesce(val, fallback interface{}) interface{} { 248 switch v := val.(type) { 249 case float32: 250 if math.IsNaN(float64(v)) { 251 return fallback 252 } 253 case float64: 254 if math.IsNaN(v) { 255 return fallback 256 } 257 } 258 return val 259 } 260 261 func signedByteLess(a, b []byte) bool { 262 // signed comparison is used for integers encoded as big-endian twos complement 263 // integers (e.g. decimals) 264 265 // if at least one of the lengths is zero, we can short circuit 266 if len(a) == 0 || len(b) == 0 { 267 return len(a) == 0 && len(b) > 0 268 } 269 270 sa := *(*[]int8)(unsafe.Pointer(&a)) 271 sb := *(*[]int8)(unsafe.Pointer(&b)) 272 273 // we can short circuit for different signd numbers or for equal length byte 274 // arrays that have different first bytes. The equality requirement is necessary 275 // for sign extension cases. 0xFF10 should be equal to 0x10 (due to big endian sign extension) 276 if int8(0x80&uint8(sa[0])) != int8(0x80&uint8(sb[0])) || (len(sa) == len(sb) && sa[0] != sb[0]) { 277 return sa[0] < sb[0] 278 } 279 280 // when the lengths are unequal and the numbers are of the same sign, we need 281 // to do comparison by sign extending the shorter value first, and once we get 282 // to equal sized arrays, lexicographical unsigned comparison of everything but 283 // the first byte is sufficient. 284 285 if len(a) != len(b) { 286 var lead []byte 287 if len(a) > len(b) { 288 leadLen := len(a) - len(b) 289 lead = a[:leadLen] 290 a = a[leadLen:] 291 } else { 292 debug.Assert(len(a) < len(b), "something weird in byte slice signed comparison") 293 leadLen := len(b) - len(a) 294 lead = b[:leadLen] 295 b = b[leadLen:] 296 } 297 298 // compare extra bytes to the sign extension of the first byte of the other number 299 var extension byte 300 if sa[0] < 0 { 301 extension = 0xFF 302 } 303 304 notequal := false 305 for _, c := range lead { 306 if c != extension { 307 notequal = true 308 break 309 } 310 } 311 312 if notequal { 313 // since sign extension are extrema values for unsigned bytes: 314 // 315 // Four cases exist: 316 // negative values: 317 // b is the longer value 318 // b must be the lesser value: return false 319 // else: 320 // a must be the lesser value: return true 321 // 322 // positive values: 323 // b is the longer value 324 // values in b must be greater than a: return true 325 // else: 326 // values in a must be greater than b: return false 327 neg := sa[0] < 0 328 blonger := len(sa) < len(sb) 329 return neg != blonger 330 } 331 } else { 332 a = a[1:] 333 b = b[1:] 334 } 335 336 return bytes.Compare(a, b) == -1 337 } 338 339 func (BooleanStatistics) defaultMin() bool { return true } 340 func (BooleanStatistics) defaultMax() bool { return false } 341 func (s *Int32Statistics) defaultMin() int32 { 342 if s.order == schema.SortUNSIGNED { 343 val := uint32(math.MaxUint32) 344 return int32(val) 345 } 346 return math.MaxInt32 347 } 348 349 func (s *Int32Statistics) defaultMax() int32 { 350 if s.order == schema.SortUNSIGNED { 351 return int32(0) 352 } 353 return math.MinInt32 354 } 355 356 func (s *Int64Statistics) defaultMin() int64 { 357 if s.order == schema.SortUNSIGNED { 358 val := uint64(math.MaxUint64) 359 return int64(val) 360 } 361 return math.MaxInt64 362 } 363 364 func (s *Int64Statistics) defaultMax() int64 { 365 if s.order == schema.SortUNSIGNED { 366 return int64(0) 367 } 368 return math.MinInt64 369 } 370 371 var ( 372 defaultMinInt96 parquet.Int96 373 defaultMinUInt96 parquet.Int96 374 defaultMaxInt96 parquet.Int96 375 defaultMaxUInt96 parquet.Int96 376 ) 377 378 func init() { 379 i96 := arrow.Uint32Traits.CastFromBytes(defaultMinInt96[:]) 380 i96[0] = math.MaxUint32 381 i96[1] = math.MaxUint32 382 i96[2] = math.MaxInt32 383 384 i96 = arrow.Uint32Traits.CastFromBytes(defaultMinUInt96[:]) 385 i96[0] = math.MaxUint32 386 i96[1] = math.MaxUint32 387 i96[2] = math.MaxUint32 388 389 // golang will initialize the bytes to 0 390 i96 = arrow.Uint32Traits.CastFromBytes(defaultMaxInt96[:]) 391 i96[2] = math.MaxInt32 + 1 392 393 // defaultMaxUInt96 will be initialized to 0 as desired 394 } 395 396 func (s *Int96Statistics) defaultMin() parquet.Int96 { 397 if s.order == schema.SortUNSIGNED { 398 return defaultMinUInt96 399 } 400 return defaultMinInt96 401 } 402 403 func (s *Int96Statistics) defaultMax() parquet.Int96 { 404 if s.order == schema.SortUNSIGNED { 405 return defaultMaxUInt96 406 } 407 return defaultMaxInt96 408 } 409 410 func (Float32Statistics) defaultMin() float32 { return math.MaxFloat32 } 411 func (Float32Statistics) defaultMax() float32 { return -math.MaxFloat32 } 412 func (Float64Statistics) defaultMin() float64 { return math.MaxFloat64 } 413 func (Float64Statistics) defaultMax() float64 { return -math.MaxFloat64 } 414 func (ByteArrayStatistics) defaultMin() parquet.ByteArray { return nil } 415 func (ByteArrayStatistics) defaultMax() parquet.ByteArray { return nil } 416 func (FixedLenByteArrayStatistics) defaultMin() parquet.FixedLenByteArray { return nil } 417 func (FixedLenByteArrayStatistics) defaultMax() parquet.FixedLenByteArray { return nil } 418 419 func (BooleanStatistics) equal(a, b bool) bool { return a == b } 420 func (Int32Statistics) equal(a, b int32) bool { return a == b } 421 func (Int64Statistics) equal(a, b int64) bool { return a == b } 422 func (Float32Statistics) equal(a, b float32) bool { return a == b } 423 func (Float64Statistics) equal(a, b float64) bool { return a == b } 424 func (Int96Statistics) equal(a, b parquet.Int96) bool { return bytes.Equal(a[:], b[:]) } 425 func (ByteArrayStatistics) equal(a, b parquet.ByteArray) bool { return bytes.Equal(a, b) } 426 func (FixedLenByteArrayStatistics) equal(a, b parquet.FixedLenByteArray) bool { 427 return bytes.Equal(a, b) 428 } 429 430 func (BooleanStatistics) less(a, b bool) bool { 431 return !a && b 432 } 433 434 func (s *Int32Statistics) less(a, b int32) bool { 435 if s.order == schema.SortUNSIGNED { 436 return uint32(a) < uint32(b) 437 } 438 return a < b 439 } 440 441 func (s *Int64Statistics) less(a, b int64) bool { 442 if s.order == schema.SortUNSIGNED { 443 return uint64(a) < uint64(b) 444 } 445 return a < b 446 } 447 func (Float32Statistics) less(a, b float32) bool { return a < b } 448 func (Float64Statistics) less(a, b float64) bool { return a < b } 449 func (s *Int96Statistics) less(a, b parquet.Int96) bool { 450 i96a := arrow.Uint32Traits.CastFromBytes(a[:]) 451 i96b := arrow.Uint32Traits.CastFromBytes(b[:]) 452 453 a0, a1, a2 := utils.ToLEUint32(i96a[0]), utils.ToLEUint32(i96a[1]), utils.ToLEUint32(i96a[2]) 454 b0, b1, b2 := utils.ToLEUint32(i96b[0]), utils.ToLEUint32(i96b[1]), utils.ToLEUint32(i96b[2]) 455 456 if a2 != b2 { 457 // only the msb bit is by signed comparison 458 if s.order == schema.SortSIGNED { 459 return int32(a2) < int32(b2) 460 } 461 return a2 < b2 462 } else if a1 != b1 { 463 return a1 < b1 464 } 465 return a0 < b0 466 } 467 468 func (s *ByteArrayStatistics) less(a, b parquet.ByteArray) bool { 469 if s.order == schema.SortUNSIGNED { 470 return bytes.Compare(a, b) == -1 471 } 472 473 return signedByteLess([]byte(a), []byte(b)) 474 } 475 476 func (s *FixedLenByteArrayStatistics) less(a, b parquet.FixedLenByteArray) bool { 477 if s.order == schema.SortUNSIGNED { 478 return bytes.Compare(a, b) == -1 479 } 480 481 return signedByteLess([]byte(a), []byte(b)) 482 } 483 484 func (BooleanStatistics) cleanStat(minMax minmaxPairBoolean) *minmaxPairBoolean { return &minMax } 485 func (Int32Statistics) cleanStat(minMax minmaxPairInt32) *minmaxPairInt32 { return &minMax } 486 func (Int64Statistics) cleanStat(minMax minmaxPairInt64) *minmaxPairInt64 { return &minMax } 487 func (Int96Statistics) cleanStat(minMax minmaxPairInt96) *minmaxPairInt96 { return &minMax } 488 489 // in the case of floating point types, the following rules are applied as per parquet-mr: 490 // - if any of min/max is NaN, return nothing 491 // - if min is 0.0f replace with -0.0f 492 // - if max is -0.0f replace with 0.0f 493 // 494 // https://issues.apache.org/jira/browse/PARQUET-1222 tracks the official documenting of 495 // a well-defined order for floats and doubles. 496 func (Float32Statistics) cleanStat(minMax minmaxPairFloat32) *minmaxPairFloat32 { 497 if math.IsNaN(float64(minMax[0])) || math.IsNaN(float64(minMax[1])) { 498 return nil 499 } 500 501 if minMax[0] == math.MaxFloat32 && minMax[1] == -math.MaxFloat32 { 502 return nil 503 } 504 505 var zero float32 = 0 506 if minMax[0] == zero && !math.Signbit(float64(minMax[0])) { 507 minMax[0] = -minMax[0] 508 } 509 510 if minMax[1] == zero && math.Signbit(float64(minMax[1])) { 511 minMax[1] = -minMax[1] 512 } 513 514 return &minMax 515 } 516 517 func (Float64Statistics) cleanStat(minMax minmaxPairFloat64) *minmaxPairFloat64 { 518 if math.IsNaN(minMax[0]) || math.IsNaN(minMax[1]) { 519 return nil 520 } 521 522 if minMax[0] == math.MaxFloat64 && minMax[1] == -math.MaxFloat64 { 523 return nil 524 } 525 526 var zero float64 = 0 527 if minMax[0] == zero && !math.Signbit(minMax[0]) { 528 minMax[0] = -minMax[0] 529 } 530 531 if minMax[1] == zero && math.Signbit(minMax[1]) { 532 minMax[1] = -minMax[1] 533 } 534 535 return &minMax 536 } 537 538 func (ByteArrayStatistics) cleanStat(minMax minmaxPairByteArray) *minmaxPairByteArray { 539 if minMax[0] == nil || minMax[1] == nil { 540 return nil 541 } 542 return &minMax 543 } 544 545 func (FixedLenByteArrayStatistics) cleanStat(minMax minmaxPairFixedLenByteArray) *minmaxPairFixedLenByteArray { 546 if minMax[0] == nil || minMax[1] == nil { 547 return nil 548 } 549 return &minMax 550 } 551 552 func GetStatValue(typ parquet.Type, val []byte) interface{} { 553 switch typ { 554 case parquet.Types.Boolean: 555 return val[0] != 0 556 case parquet.Types.Int32: 557 return int32(binary.LittleEndian.Uint32(val)) 558 case parquet.Types.Int64: 559 return int64(binary.LittleEndian.Uint64(val)) 560 case parquet.Types.Int96: 561 p := parquet.Int96{} 562 copy(p[:], val) 563 return p 564 case parquet.Types.Float: 565 return math.Float32frombits(binary.LittleEndian.Uint32(val)) 566 case parquet.Types.Double: 567 return math.Float64frombits(binary.LittleEndian.Uint64(val)) 568 case parquet.Types.ByteArray: 569 fallthrough 570 case parquet.Types.FixedLenByteArray: 571 return val 572 } 573 return nil 574 }