// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metadata

import (
	"bytes"
	"encoding/binary"
	"math"
	"unsafe"

	"github.com/apache/arrow/go/v16/arrow"
	"github.com/apache/arrow/go/v16/arrow/float16"
	"github.com/apache/arrow/go/v16/arrow/memory"
	"github.com/apache/arrow/go/v16/internal/utils"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/internal/debug"
	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
	format "github.com/apache/arrow/go/v16/parquet/internal/gen-go/parquet"
	"github.com/apache/arrow/go/v16/parquet/schema"
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=statistics_types.tmpldata statistics_types.gen.go.tmpl

// StatProvider is a read-only view of a statistics block. It exposes the
// min and max values as raw bytes and the null and distinct counts as
// int64, and pairs each of those four values with an IsSet* method.
//
// NOTE(review): the IsSet* methods presumably report whether the matching
// value is populated, and the accessor naming suggests the interface is
// meant to be satisfied by the generated thrift Statistics type - confirm
// against the implementations.
type StatProvider interface {
	GetMin() []byte
	GetMax() []byte
	GetNullCount() int64
	GetDistinctCount() int64
	IsSetMax() bool
	IsSetMin() bool
	IsSetNullCount() bool
	IsSetDistinctCount() bool
}

// EncodedStatistics are raw statistics with encoded values that will be written
// to the parquet file, or were read from the parquet file.
type EncodedStatistics struct {
	// HasMax / HasMin report whether Max / Min hold a usable value.
	HasMax bool
	Max    []byte
	HasMin bool
	Min    []byte
	// Signed makes ToThrift additionally fill the legacy Min/Max thrift
	// fields alongside MinValue/MaxValue.
	Signed           bool
	HasNullCount     bool
	NullCount        int64
	HasDistinctCount bool
	DistinctCount    int64
}

// ApplyStatSizeLimits sets the maximum size of the min/max values.
//
// Following parquet-mr, stats larger than the max size are not written at
// all rather than being truncated. The rationale is that some engines may
// use the minimum value in the page as the true minimum for aggregations,
// and there is no way to mark that a value has been truncated and is only a
// lower bound rather than a value actually present in the page.
//
// Only the HasMax/HasMin flags are cleared; the Max/Min byte slices
// themselves are left untouched.
func (e *EncodedStatistics) ApplyStatSizeLimits(length int) {
	if len(e.Max) > length {
		e.HasMax = false
	}
	if len(e.Min) > length {
		e.HasMin = false
	}
}

// IsSet returns true iff one of the Has* values is true.
func (e *EncodedStatistics) IsSet() bool {
	return e.HasMin || e.HasMax || e.HasNullCount || e.HasDistinctCount
}

// SetMax sets the encoded Max value to val and sets HasMax to true.
// The slice is stored as-is (not copied), so it shares val's backing array.
// It returns e so calls can be chained.
func (e *EncodedStatistics) SetMax(val []byte) *EncodedStatistics {
	e.Max = val[:]
	e.HasMax = true
	return e
}

// SetMin sets the encoded Min value to val, and sets HasMin to true.
// The slice is stored as-is (not copied), so it shares val's backing array.
// It returns e so calls can be chained.
func (e *EncodedStatistics) SetMin(val []byte) *EncodedStatistics {
	e.Min = val[:]
	e.HasMin = true
	return e
}

// SetNullCount sets the NullCount to val and sets HasNullCount to true.
// It returns e so calls can be chained.
func (e *EncodedStatistics) SetNullCount(val int64) *EncodedStatistics {
	e.NullCount = val
	e.HasNullCount = true
	return e
}

// SetDistinctCount sets the DistinctCount to val and sets HasDistinctCount to true.
// It returns e so calls can be chained.
func (e *EncodedStatistics) SetDistinctCount(val int64) *EncodedStatistics {
	e.DistinctCount = val
	e.HasDistinctCount = true
	return e
}

// ToThrift builds a new thrift Statistics object from the fields whose Has*
// flag is set. The count fields are set to pointers into e (not copies), and
// the min/max byte slices are shared with e.
func (e *EncodedStatistics) ToThrift() (stats *format.Statistics) {
	stats = format.NewStatistics()
	if e.HasMin {
		stats.MinValue = e.Min
		// if sort order is SIGNED then the old min value must be set too for backwards compatibility
		if e.Signed {
			stats.Min = e.Min
		}
	}
	if e.HasMax {
		stats.MaxValue = e.Max
		// if sort order is SIGNED then the old max value must be set too
		if e.Signed {
			stats.Max = e.Max
		}
	}
	if e.HasNullCount {
		stats.NullCount = &e.NullCount
	}
	if e.HasDistinctCount {
		stats.DistinctCount = &e.DistinctCount
	}
	return
}

// TypedStatistics is the base interface for dealing with stats as
// they are being populated
type TypedStatistics interface {
	// Type is the underlying physical type for this stat block
	Type() parquet.Type
	// Returns true if there is a min and max value set for this stat object
	HasMinMax() bool
	// Returns true if a nullcount has been set
	HasNullCount() bool
	// Returns true only if a distinct count has been set. The current
	// implementation of the writer does not automatically populate
	// the distinct count right now.
	HasDistinctCount() bool
	NullCount() int64
	DistinctCount() int64
	NumValues() int64
	// return the column descriptor that this stat object was initialized with
	Descr() *schema.Column

	// Encode the current min value and return the bytes. ByteArray does not
	// include the len in the encoded bytes, otherwise this is identical to
	// plain encoding
	EncodeMin() []byte
	// Encode the current max value and return the bytes. ByteArray does not
	// include the len in the encoded bytes, otherwise this is identical to
	// plain encoding
	EncodeMax() []byte
	// Populate an EncodedStatistics object from the current stats
	Encode() (EncodedStatistics, error)
	// Resets all values to 0 to enable reusing this stat object for multiple
	// columns, by calling Encode to get the finished values and then calling
	// reset
	Reset()
	// Merge the min/max/nullcounts and distinct count from the passed stat object
	// into this one.
	Merge(TypedStatistics)

	// UpdateFromArrow updates the statistics from an Arrow Array,
	// only updating the null and num value counts if updateCounts
	// is true.
	UpdateFromArrow(values arrow.Array, updateCounts bool) error
	// IncNulls increments the number of nulls in the statistics
	// and marks HasNullCount as true
	IncNulls(int64)
	// IncDistinct increments the number of distinct values in
	// the statistics and marks HasDistinctCount as true
	IncDistinct(int64)
	// IncNumValues increments the total number of values in
	// the statistics
	IncNumValues(int64)
}

// statistics holds the type-independent state behind the accessor methods
// declared in TypedStatistics: the counters and their "has" flags, the
// column descriptor, and the sort order that the typed comparison helpers
// (less, defaultMin, defaultMax) consult.
//
// NOTE(review): mem and encoder are not referenced anywhere in this part of
// the file; presumably the generated typed implementations use them.
type statistics struct {
	descr            *schema.Column
	hasMinMax        bool
	hasNullCount     bool
	hasDistinctCount bool
	mem              memory.Allocator
	nvalues          int64
	stats            EncodedStatistics
	order            schema.SortOrder

	encoder encoding.TypedEncoder
}

// IncNumValues adds n to the running value count.
func (s *statistics) IncNumValues(n int64) {
	s.nvalues += n
}

// IncNulls adds n to the null count and marks the null count as set.
func (s *statistics) IncNulls(n int64) {
	s.stats.NullCount += n
	s.hasNullCount = true
}

// IncDistinct adds n to the distinct count and marks the distinct count as set.
func (s *statistics) IncDistinct(n int64) {
	s.stats.DistinctCount += n
	s.hasDistinctCount = true
}

// Simple accessors over the shared state.
func (s *statistics) Descr() *schema.Column  { return s.descr }
func (s *statistics) Type() parquet.Type     { return s.descr.PhysicalType() }
func (s *statistics) HasDistinctCount() bool { return s.hasDistinctCount }
func (s *statistics) HasMinMax() bool        { return s.hasMinMax }
func (s *statistics) HasNullCount() bool     { return s.hasNullCount }
func (s *statistics) NullCount() int64       { return s.stats.NullCount }
func (s *statistics) DistinctCount() int64   { return s.stats.DistinctCount }
func (s *statistics) NumValues() int64       { return s.nvalues }

// Reset zeroes the counters and clears the three "has" flags. It does not
// touch the encoded Min/Max bytes or the Has* flags inside s.stats, nor the
// descriptor, sort order, allocator or encoder.
func (s *statistics) Reset() {
	s.stats.NullCount = 0
	s.stats.DistinctCount = 0
	s.nvalues = 0
	s.hasMinMax = false
	s.hasDistinctCount = false
	s.hasNullCount = false
}

// base merge function for base non-typed stat object so we don't have to
// duplicate this in each of the typed implementations.
//
// It accumulates the value count unconditionally and the null/distinct
// counts only when other reports them as set. Note that it does not change
// s.hasNullCount or s.hasDistinctCount, and does not merge min/max.
func (s *statistics) merge(other TypedStatistics) {
	s.nvalues += other.NumValues()
	if other.HasNullCount() {
		s.stats.NullCount += other.NullCount()
	}
	if other.HasDistinctCount() {
		// this isn't technically correct as it should be keeping an actual set
		// of the distinct values and then combining the sets to get a new count
		// but for now we'll do this to match the C++ implementation at the current
		// time.
		s.stats.DistinctCount += other.DistinctCount()
	}
}

// coalesce returns fallback when val is a float32 or float64 NaN, and val
// itself in every other case (including non-float dynamic types).
func coalesce(val, fallback interface{}) interface{} {
	switch v := val.(type) {
	case float32:
		if math.IsNaN(float64(v)) {
			return fallback
		}
	case float64:
		if math.IsNaN(v) {
			return fallback
		}
	}
	return val
}

// signedByteLess reports whether a < b when both slices are interpreted as
// big-endian two's complement integers of possibly different lengths.
func signedByteLess(a, b []byte) bool {
	// signed comparison is used for integers encoded as big-endian twos complement
	// integers (e.g. decimals)

	// if at least one of the lengths is zero, we can short circuit:
	// an empty slice is less than any non-empty one and never less than
	// another empty one.
	if len(a) == 0 || len(b) == 0 {
		return len(a) == 0 && len(b) > 0
	}

	// reinterpret the same bytes as int8 without copying; sa and sb keep
	// viewing the full original slices even after a and b are re-sliced below.
	sa := *(*[]int8)(unsafe.Pointer(&a))
	sb := *(*[]int8)(unsafe.Pointer(&b))

	// we can short circuit for different signed numbers or for equal length byte
	// arrays that have different first bytes. The equal length requirement is
	// necessary because of sign extension: values of different lengths can
	// denote the same number (e.g. 0xFFF0 and 0xF0, or 0x0010 and 0x10).
	if int8(0x80&uint8(sa[0])) != int8(0x80&uint8(sb[0])) || (len(sa) == len(sb) && sa[0] != sb[0]) {
		return sa[0] < sb[0]
	}

	// when the lengths are unequal and the numbers are of the same sign, we need
	// to do comparison by sign extending the shorter value first, and once we get
	// to equal sized arrays, lexicographical unsigned comparison of everything but
	// the first byte is sufficient.

	if len(a) != len(b) {
		// lead holds the extra high-order bytes of the longer value; the
		// longer slice is trimmed so a and b end up the same length.
		var lead []byte
		if len(a) > len(b) {
			leadLen := len(a) - len(b)
			lead = a[:leadLen]
			a = a[leadLen:]
		} else {
			debug.Assert(len(a) < len(b), "something weird in byte slice signed comparison")
			leadLen := len(b) - len(a)
			lead = b[:leadLen]
			b = b[leadLen:]
		}

		// compare extra bytes to the sign extension of the first byte of the other number
		// (both values have the same sign here, so a's sign is used for both)
		var extension byte
		if sa[0] < 0 {
			extension = 0xFF
		}

		notequal := false
		for _, c := range lead {
			if c != extension {
				notequal = true
				break
			}
		}

		if notequal {
			// since sign extension are extrema values for unsigned bytes:
			//
			// Four cases exist:
			//   negative values:
			//     b is the longer value
			//       b must be the lesser value: return false
			//     else:
			//       a must be the lesser value: return true
			//
			//   positive values:
			//     b is the longer value
			//       values in b must be greater than a: return true
			//     else:
			//       values in a must be greater than b: return false
			neg := sa[0] < 0
			blonger := len(sa) < len(sb)
			return neg != blonger
		}
	} else {
		// equal lengths: the first bytes are already known to be identical
		// (otherwise the short circuit above would have returned).
		a = a[1:]
		b = b[1:]
	}

	return bytes.Compare(a, b) == -1
}

// The defaultMin/defaultMax helpers return the starting values for a running
// minimum/maximum: defaultMin is the largest value under the type's ordering
// and defaultMax the smallest, so any real value replaces them.

func (BooleanStatistics) defaultMin() bool { return true }
func (BooleanStatistics) defaultMax() bool { return false }

// defaultMin returns the largest int32 under the column's sort order; for
// unsigned order that is the all-ones bit pattern reinterpreted as int32.
func (s *Int32Statistics) defaultMin() int32 {
	if s.order == schema.SortUNSIGNED {
		val := uint32(math.MaxUint32)
		return int32(val)
	}
	return math.MaxInt32
}

// defaultMax returns the smallest int32 under the column's sort order.
func (s *Int32Statistics) defaultMax() int32 {
	if s.order == schema.SortUNSIGNED {
		return int32(0)
	}
	return math.MinInt32
}

// defaultMin returns the largest int64 under the column's sort order; for
// unsigned order that is the all-ones bit pattern reinterpreted as int64.
func (s *Int64Statistics) defaultMin() int64 {
	if s.order == schema.SortUNSIGNED {
		val := uint64(math.MaxUint64)
		return int64(val)
	}
	return math.MaxInt64
}

// defaultMax returns the smallest int64 under the column's sort order.
func (s *Int64Statistics) defaultMax() int64 {
	if s.order == schema.SortUNSIGNED {
		return int64(0)
	}
	return math.MinInt64
}

// Sentinel starting values for Int96 and Float16 statistics. The Int96
// values are filled in by init below.
var (
	defaultMinInt96  parquet.Int96
	defaultMinUInt96 parquet.Int96
	defaultMaxInt96  parquet.Int96
	defaultMaxUInt96 parquet.Int96

	defaultMinFloat16 parquet.FixedLenByteArray = float16.MaxNum.ToLEBytes()
	defaultMaxFloat16 parquet.FixedLenByteArray = float16.MinNum.ToLEBytes()
)

// init fills the Int96 sentinels by viewing each 12-byte value as three
// uint32 words. Word 2 is the most significant one in Int96Statistics.less,
// so MaxInt32 / MaxInt32+1 there give the signed maximum / minimum.
func init() {
	i96 := arrow.Uint32Traits.CastFromBytes(defaultMinInt96[:])
	i96[0] = math.MaxUint32
	i96[1] = math.MaxUint32
	i96[2] = math.MaxInt32

	i96 = arrow.Uint32Traits.CastFromBytes(defaultMinUInt96[:])
	i96[0] = math.MaxUint32
	i96[1] = math.MaxUint32
	i96[2] = math.MaxUint32

	// golang will initialize the bytes to 0
	i96 = arrow.Uint32Traits.CastFromBytes(defaultMaxInt96[:])
	i96[2] = math.MaxInt32 + 1

	// defaultMaxUInt96 will be initialized to 0 as desired
}

// defaultMin returns the largest Int96 under the column's sort order.
func (s *Int96Statistics) defaultMin() parquet.Int96 {
	if s.order == schema.SortUNSIGNED {
		return defaultMinUInt96
	}
	return defaultMinInt96
}

// defaultMax returns the smallest Int96 under the column's sort order.
func (s *Int96Statistics) defaultMax() parquet.Int96 {
	if s.order == schema.SortUNSIGNED {
		return defaultMaxUInt96
	}
	return defaultMaxInt96
}

// defaultMin returns the shared defaultMinFloat16 slice (float16.MaxNum as
// little-endian bytes).
func (Float16Statistics) defaultMin() parquet.FixedLenByteArray {
	return defaultMinFloat16
}

// defaultMax returns the shared defaultMaxFloat16 slice (float16.MinNum as
// little-endian bytes).
func (Float16Statistics) defaultMax() parquet.FixedLenByteArray {
	return defaultMaxFloat16
}

// Floats start from the largest/smallest finite values; byte arrays have no
// extremum and use nil to mean "nothing seen yet".
func (Float32Statistics) defaultMin() float32                             { return math.MaxFloat32 }
func (Float32Statistics) defaultMax() float32                             { return -math.MaxFloat32 }
func (Float64Statistics) defaultMin() float64                             { return math.MaxFloat64 }
func (Float64Statistics) defaultMax() float64                             { return -math.MaxFloat64 }
func (ByteArrayStatistics) defaultMin() parquet.ByteArray                 { return nil }
func (ByteArrayStatistics) defaultMax() parquet.ByteArray                 { return nil }
func (FixedLenByteArrayStatistics) defaultMin() parquet.FixedLenByteArray { return nil }
func (FixedLenByteArrayStatistics) defaultMax() parquet.FixedLenByteArray { return nil }

// The equal helpers test two values of the statistic's type for equality:
// == for scalars (so NaN never equals NaN for the float types) and
// byte-wise comparison for Int96 and the byte array types.
func (BooleanStatistics) equal(a, b bool) bool                { return a == b }
func (Int32Statistics) equal(a, b int32) bool                 { return a == b }
func (Int64Statistics) equal(a, b int64) bool                 { return a == b }
func (Float32Statistics) equal(a, b float32) bool             { return a == b }
func (Float64Statistics) equal(a, b float64) bool             { return a == b }
func (Int96Statistics) equal(a, b parquet.Int96) bool         { return bytes.Equal(a[:], b[:]) }
func (ByteArrayStatistics) equal(a, b parquet.ByteArray) bool { return bytes.Equal(a, b) }
func (FixedLenByteArrayStatistics) equal(a, b parquet.FixedLenByteArray) bool {
	return bytes.Equal(a, b)
}

// equal decodes both operands as little-endian float16 values and compares
// them with float16's Equal.
func (Float16Statistics) equal(a, b parquet.FixedLenByteArray) bool {
	return float16.FromLEBytes(a).Equal(float16.FromLEBytes(b))
}

// less orders false before true.
func (BooleanStatistics) less(a, b bool) bool {
	return !a && b
}

// less compares as uint32 when the column sort order is unsigned, and as
// int32 otherwise.
func (s *Int32Statistics) less(a, b int32) bool {
	if s.order == schema.SortUNSIGNED {
		return uint32(a) < uint32(b)
	}
	return a < b
}

// less compares as uint64 when the column sort order is unsigned, and as
// int64 otherwise.
func (s *Int64Statistics) less(a, b int64) bool {
	if s.order == schema.SortUNSIGNED {
		return uint64(a) < uint64(b)
	}
	return a < b
}
func (Float32Statistics) less(a, b float32) bool { return a < b }
func (Float64Statistics) less(a, b float64) bool { return a < b }

// less compares two Int96 values as three uint32 words, most significant
// word (index 2) first.
func (s *Int96Statistics) less(a, b parquet.Int96) bool {
	i96a := arrow.Uint32Traits.CastFromBytes(a[:])
	i96b := arrow.Uint32Traits.CastFromBytes(b[:])

	a0, a1, a2 := utils.ToLEUint32(i96a[0]), utils.ToLEUint32(i96a[1]), utils.ToLEUint32(i96a[2])
	b0, b1, b2 := utils.ToLEUint32(i96b[0]), utils.ToLEUint32(i96b[1]), utils.ToLEUint32(i96b[2])

	if a2 != b2 {
		// only the most significant word is compared as signed (and only
		// for signed sort order); the lower words are always unsigned
		if s.order == schema.SortSIGNED {
			return int32(a2) < int32(b2)
		}
		return a2 < b2
	} else if a1 != b1 {
		return a1 < b1
	}
	return a0 < b0
}

// less uses unsigned lexicographic byte order for unsigned sort order and
// signedByteLess (big-endian two's complement) for any other order.
func (s *ByteArrayStatistics) less(a, b parquet.ByteArray) bool {
	if s.order == schema.SortUNSIGNED {
		return bytes.Compare(a, b) == -1
	}

	return signedByteLess([]byte(a), []byte(b))
}

// less uses unsigned lexicographic byte order for unsigned sort order and
// signedByteLess (big-endian two's complement) for any other order.
func (s *FixedLenByteArrayStatistics) less(a, b parquet.FixedLenByteArray) bool {
	if s.order == schema.SortUNSIGNED {
		return bytes.Compare(a, b) == -1
	}

	return signedByteLess([]byte(a), []byte(b))
}

// less decodes both operands as little-endian float16 values and compares
// them with float16's Less.
func (Float16Statistics) less(a, b parquet.FixedLenByteArray) bool {
	return float16.FromLEBytes(a).Less(float16.FromLEBytes(b))
}

// For booleans and the integer types cleanStat is a pass-through: it returns
// a pointer to its own copy of the pair, unchanged.
func (BooleanStatistics) cleanStat(minMax minmaxPairBoolean) *minmaxPairBoolean { return &minMax }
func (Int32Statistics) cleanStat(minMax minmaxPairInt32) *minmaxPairInt32       { return &minMax }
func (Int64Statistics) cleanStat(minMax minmaxPairInt64) *minmaxPairInt64       { return &minMax }
func (Int96Statistics) cleanStat(minMax minmaxPairInt96) *minmaxPairInt96       { return &minMax }

// in the case of floating point types, the following rules are applied as per parquet-mr:
//   - if any of min/max is NaN, return nothing
//   - if min is 0.0f replace with -0.0f
//   - if max is -0.0f replace with 0.0f
//
// https://issues.apache.org/jira/browse/PARQUET-1222 tracks the official documenting of
// a well-defined order
for floats and doubles. 516 func (Float32Statistics) cleanStat(minMax minmaxPairFloat32) *minmaxPairFloat32 { 517 if math.IsNaN(float64(minMax[0])) || math.IsNaN(float64(minMax[1])) { 518 return nil 519 } 520 521 if minMax[0] == math.MaxFloat32 && minMax[1] == -math.MaxFloat32 { 522 return nil 523 } 524 525 var zero float32 = 0 526 if minMax[0] == zero && !math.Signbit(float64(minMax[0])) { 527 minMax[0] = -minMax[0] 528 } 529 530 if minMax[1] == zero && math.Signbit(float64(minMax[1])) { 531 minMax[1] = -minMax[1] 532 } 533 534 return &minMax 535 } 536 537 func (Float64Statistics) cleanStat(minMax minmaxPairFloat64) *minmaxPairFloat64 { 538 if math.IsNaN(minMax[0]) || math.IsNaN(minMax[1]) { 539 return nil 540 } 541 542 if minMax[0] == math.MaxFloat64 && minMax[1] == -math.MaxFloat64 { 543 return nil 544 } 545 546 var zero float64 = 0 547 if minMax[0] == zero && !math.Signbit(minMax[0]) { 548 minMax[0] = -minMax[0] 549 } 550 551 if minMax[1] == zero && math.Signbit(minMax[1]) { 552 minMax[1] = -minMax[1] 553 } 554 555 return &minMax 556 } 557 558 func (Float16Statistics) cleanStat(minMax minmaxPairFloat16) *minmaxPairFloat16 { 559 min := float16.FromLEBytes(minMax[0][:]) 560 max := float16.FromLEBytes(minMax[1][:]) 561 562 if min.IsNaN() || max.IsNaN() { 563 return nil 564 } 565 566 if min.Equal(float16.MaxNum) && max.Equal(float16.MinNum) { 567 return nil 568 } 569 570 zero := float16.New(0) 571 if min.Equal(zero) && !min.Signbit() { 572 minMax[0] = min.Negate().ToLEBytes() 573 } 574 if max.Equal(zero) && max.Signbit() { 575 minMax[1] = max.Negate().ToLEBytes() 576 } 577 578 return &minMax 579 } 580 581 func (ByteArrayStatistics) cleanStat(minMax minmaxPairByteArray) *minmaxPairByteArray { 582 if minMax[0] == nil || minMax[1] == nil { 583 return nil 584 } 585 return &minMax 586 } 587 588 func (FixedLenByteArrayStatistics) cleanStat(minMax minmaxPairFixedLenByteArray) *minmaxPairFixedLenByteArray { 589 if minMax[0] == nil || minMax[1] == nil { 590 return nil 591 } 
592 return &minMax 593 } 594 595 func GetStatValue(typ parquet.Type, val []byte) interface{} { 596 switch typ { 597 case parquet.Types.Boolean: 598 return val[0] != 0 599 case parquet.Types.Int32: 600 return int32(binary.LittleEndian.Uint32(val)) 601 case parquet.Types.Int64: 602 return int64(binary.LittleEndian.Uint64(val)) 603 case parquet.Types.Int96: 604 p := parquet.Int96{} 605 copy(p[:], val) 606 return p 607 case parquet.Types.Float: 608 return math.Float32frombits(binary.LittleEndian.Uint32(val)) 609 case parquet.Types.Double: 610 return math.Float64frombits(binary.LittleEndian.Uint64(val)) 611 case parquet.Types.ByteArray: 612 fallthrough 613 case parquet.Types.FixedLenByteArray: 614 return val 615 } 616 return nil 617 }