github.com/matrixorigin/matrixone@v1.2.0/pkg/sort/sort.go (about) 1 // Copyright 2021 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package sort 16 17 import ( 18 "bytes" 19 "math/bits" 20 21 "github.com/matrixorigin/matrixone/pkg/container/nulls" 22 "github.com/matrixorigin/matrixone/pkg/container/types" 23 "github.com/matrixorigin/matrixone/pkg/container/vector" 24 "github.com/matrixorigin/matrixone/pkg/vectorize/moarray" 25 ) 26 27 const ( 28 unknownHint sortedHint = iota 29 increasingHint 30 decreasingHint 31 ) 32 33 type xorshift uint64 34 type sortedHint int // hint for pdqsort when choosing the pivot 35 36 type LessFunc[T any] func(a, b T) bool 37 38 func GenericLess[T types.OrderedT](a, b T) bool { 39 return a < b 40 } 41 42 func BoolLess(a, b bool) bool { return !a && b } 43 44 func Decimal64Less(a, b types.Decimal64) bool { return a.Lt(b) } 45 46 func Decimal128Less(a, b types.Decimal128) bool { return a.Lt(b) } 47 48 func UuidLess(a, b types.Uuid) bool { 49 return a.Lt(b) 50 } 51 52 // it seems that go has no const generic type, handle these types respectively 53 func TsLess(a, b types.TS) bool { return bytes.Compare(a[:], b[:]) < 0 } 54 func RowidLess(a, b types.Rowid) bool { return bytes.Compare(a[:], b[:]) < 0 } 55 func BlockidLess(a, b types.Blockid) bool { return bytes.Compare(a[:], b[:]) < 0 } 56 57 func Sort(desc, nullsLast, hasNull bool, os []int64, vec *vector.Vector, strCol []string) { 58 if hasNull 
{ 59 sz := len(os) 60 if nullsLast { // move null rows to the tail 61 var cursor int 62 for cursor < sz && !nulls.Contains(vec.GetNulls(), uint64(os[cursor])) { 63 cursor++ 64 } 65 if cursor == sz { 66 return 67 } 68 for i := cursor; i < sz; i++ { 69 if !nulls.Contains(vec.GetNulls(), uint64(os[i])) { 70 os[cursor], os[i] = os[i], os[cursor] 71 cursor++ 72 } 73 } 74 os = os[:cursor] 75 } else { // move null rows to the head 76 var cursor int 77 for cursor < sz && nulls.Contains(vec.GetNulls(), uint64(os[cursor])) { 78 cursor++ 79 } 80 if cursor == sz { 81 return 82 } 83 for i := cursor; i < sz; i++ { 84 if nulls.Contains(vec.GetNulls(), uint64(os[i])) { 85 os[cursor], os[i] = os[i], os[cursor] 86 cursor++ 87 } 88 } 89 os = os[cursor:] 90 } 91 } 92 // sort only non-null rows 93 switch vec.GetType().Oid { 94 case types.T_bool: 95 col := vector.MustFixedCol[bool](vec) 96 if !desc { 97 genericSort(col, os, boolLess[bool]) 98 } else { 99 genericSort(col, os, boolGreater[bool]) 100 } 101 case types.T_bit: 102 col := vector.MustFixedCol[uint64](vec) 103 if !desc { 104 genericSort(col, os, genericLess[uint64]) 105 } else { 106 genericSort(col, os, genericGreater[uint64]) 107 } 108 case types.T_int8: 109 col := vector.MustFixedCol[int8](vec) 110 if !desc { 111 genericSort(col, os, genericLess[int8]) 112 } else { 113 genericSort(col, os, genericGreater[int8]) 114 } 115 case types.T_int16: 116 col := vector.MustFixedCol[int16](vec) 117 if !desc { 118 genericSort(col, os, genericLess[int16]) 119 } else { 120 genericSort(col, os, genericGreater[int16]) 121 } 122 case types.T_int32: 123 col := vector.MustFixedCol[int32](vec) 124 if !desc { 125 genericSort(col, os, genericLess[int32]) 126 } else { 127 genericSort(col, os, genericGreater[int32]) 128 } 129 case types.T_int64: 130 col := vector.MustFixedCol[int64](vec) 131 if !desc { 132 genericSort(col, os, genericLess[int64]) 133 } else { 134 genericSort(col, os, genericGreater[int64]) 135 } 136 case types.T_uint8: 137 col := 
vector.MustFixedCol[uint8](vec) 138 if !desc { 139 genericSort(col, os, genericLess[uint8]) 140 } else { 141 genericSort(col, os, genericGreater[uint8]) 142 } 143 case types.T_uint16: 144 col := vector.MustFixedCol[uint16](vec) 145 if !desc { 146 genericSort(col, os, genericLess[uint16]) 147 } else { 148 genericSort(col, os, genericGreater[uint16]) 149 } 150 case types.T_uint32: 151 col := vector.MustFixedCol[uint32](vec) 152 if !desc { 153 genericSort(col, os, genericLess[uint32]) 154 } else { 155 genericSort(col, os, genericGreater[uint32]) 156 } 157 case types.T_uint64: 158 col := vector.MustFixedCol[uint64](vec) 159 if !desc { 160 genericSort(col, os, genericLess[uint64]) 161 } else { 162 genericSort(col, os, genericGreater[uint64]) 163 } 164 case types.T_float32: 165 col := vector.MustFixedCol[float32](vec) 166 if !desc { 167 genericSort(col, os, genericLess[float32]) 168 } else { 169 genericSort(col, os, genericGreater[float32]) 170 } 171 case types.T_float64: 172 col := vector.MustFixedCol[float64](vec) 173 if !desc { 174 genericSort(col, os, genericLess[float64]) 175 } else { 176 genericSort(col, os, genericGreater[float64]) 177 } 178 case types.T_date: 179 col := vector.MustFixedCol[types.Date](vec) 180 if !desc { 181 genericSort(col, os, genericLess[types.Date]) 182 } else { 183 genericSort(col, os, genericGreater[types.Date]) 184 } 185 case types.T_datetime: 186 col := vector.MustFixedCol[types.Datetime](vec) 187 if !desc { 188 genericSort(col, os, genericLess[types.Datetime]) 189 } else { 190 genericSort(col, os, genericGreater[types.Datetime]) 191 } 192 case types.T_time: 193 col := vector.MustFixedCol[types.Time](vec) 194 if !desc { 195 genericSort(col, os, genericLess[types.Time]) 196 } else { 197 genericSort(col, os, genericGreater[types.Time]) 198 } 199 case types.T_timestamp: 200 col := vector.MustFixedCol[types.Timestamp](vec) 201 if !desc { 202 genericSort(col, os, genericLess[types.Timestamp]) 203 } else { 204 genericSort(col, os, 
genericGreater[types.Timestamp]) 205 } 206 case types.T_enum: 207 col := vector.MustFixedCol[types.Enum](vec) 208 if !desc { 209 genericSort(col, os, genericLess[types.Enum]) 210 } else { 211 genericSort(col, os, genericGreater[types.Enum]) 212 } 213 case types.T_decimal64: 214 col := vector.MustFixedCol[types.Decimal64](vec) 215 if !desc { 216 genericSort(col, os, decimal64Less) 217 } else { 218 genericSort(col, os, decimal64Greater) 219 } 220 case types.T_decimal128: 221 col := vector.MustFixedCol[types.Decimal128](vec) 222 if !desc { 223 genericSort(col, os, decimal128Less) 224 } else { 225 genericSort(col, os, decimal128Greater) 226 } 227 case types.T_uuid: 228 col := vector.MustFixedCol[types.Uuid](vec) 229 if !desc { 230 genericSort(col, os, uuidLess) 231 } else { 232 genericSort(col, os, uuidGreater) 233 } 234 case types.T_char, types.T_varchar, types.T_blob, types.T_text, types.T_binary, types.T_varbinary: 235 if strCol == nil { 236 strCol = vector.MustStrCol(vec) 237 } 238 if !desc { 239 genericSort(strCol, os, genericLess[string]) 240 } else { 241 genericSort(strCol, os, genericGreater[string]) 242 } 243 case types.T_array_float32: 244 col := vector.MustArrayCol[float32](vec) 245 if !desc { 246 genericSort(col, os, arrayLess[float32]) 247 } else { 248 genericSort(col, os, arrayGreater[float32]) 249 } 250 case types.T_array_float64: 251 col := vector.MustArrayCol[float64](vec) 252 if !desc { 253 genericSort(col, os, arrayLess[float64]) 254 } else { 255 genericSort(col, os, arrayGreater[float64]) 256 } 257 case types.T_TS: 258 col := vector.MustFixedCol[types.TS](vec) 259 if !desc { 260 genericSort(col, os, tsLess) 261 } else { 262 genericSort(col, os, tsGreater) 263 } 264 case types.T_Rowid: 265 col := vector.MustFixedCol[types.Rowid](vec) 266 if !desc { 267 genericSort(col, os, rowidLess) 268 } else { 269 genericSort(col, os, rowidGreater) 270 } 271 case types.T_Blockid: 272 col := vector.MustFixedCol[types.Blockid](vec) 273 if !desc { 274 
genericSort(col, os, blockidLess) 275 } else { 276 genericSort(col, os, blockidGreater) 277 } 278 } 279 } 280 281 func boolLess[T bool](data []T, i, j int64) bool { 282 return bool(!data[i] && data[j]) 283 } 284 285 func boolGreater[T bool](data []T, i, j int64) bool { 286 return bool(data[i] && !data[j]) 287 } 288 289 func decimal64Less(data []types.Decimal64, i, j int64) bool { 290 return data[i].Compare(data[j]) < 0 291 } 292 293 func decimal64Greater(data []types.Decimal64, i, j int64) bool { 294 return data[i].Compare(data[j]) > 0 295 } 296 297 func decimal128Less(data []types.Decimal128, i, j int64) bool { 298 return data[i].Compare(data[j]) < 0 299 } 300 301 func decimal128Greater(data []types.Decimal128, i, j int64) bool { 302 return data[i].Compare(data[j]) > 0 303 } 304 305 func tsLess(data []types.TS, i, j int64) bool { 306 return data[i].Less(&data[j]) 307 } 308 309 func tsGreater(data []types.TS, i, j int64) bool { 310 return data[i].Greater(&data[j]) 311 } 312 313 func rowidLess(data []types.Rowid, i, j int64) bool { 314 return data[i].Less(data[j]) 315 } 316 317 func rowidGreater(data []types.Rowid, i, j int64) bool { 318 return data[i].Great(data[j]) 319 } 320 321 func blockidLess(data []types.Blockid, i, j int64) bool { 322 return data[i].Less(data[j]) 323 } 324 325 func blockidGreater(data []types.Blockid, i, j int64) bool { 326 return data[i].Great(data[j]) 327 } 328 329 func uuidLess(data []types.Uuid, i, j int64) bool { 330 return data[i].Compare(data[j]) < 0 331 } 332 333 func arrayLess[T types.RealNumbers](data [][]T, i, j int64) bool { 334 return moarray.Compare[T](data[i], data[j]) < 0 335 } 336 337 func uuidGreater(data []types.Uuid, i, j int64) bool { 338 return data[i].Compare(data[j]) > 0 339 } 340 341 func arrayGreater[T types.RealNumbers](data [][]T, i, j int64) bool { 342 return moarray.Compare[T](data[i], data[j]) > 0 343 } 344 345 func genericLess[T types.OrderedT](data []T, i, j int64) bool { 346 return data[i] < data[j] 347 } 348 
// genericGreater reports whether data[i] > data[j]; it is the descending
// counterpart of genericLess.
func genericGreater[T types.OrderedT](data []T, i, j int64) bool {
	return data[i] > data[j]
}

// Next advances the generator state in place with three shift-and-xor steps
// and returns the new state.
func (r *xorshift) Next() uint64 {
	*r ^= *r << 13
	*r ^= *r >> 17
	*r ^= *r << 5
	return uint64(*r)
}

// nextPowerOfTwo returns 1 << bits.Len(uint(length)), which for a non-negative
// length is the smallest power of two strictly greater than length.
func nextPowerOfTwo(length int) uint {
	shift := uint(bits.Len(uint(length)))
	return uint(1 << shift)
}

// genericSort sorts the row offsets in os by the order fn defines over data:
// fn(data, i, j) reports whether row i must come before row j. Only os is
// permuted; data itself is never modified. Slices of length 0 or 1 are
// returned untouched. The sort is not guaranteed to be stable.
func genericSort[T any](data []T, os []int64, fn func([]T, int64, int64) bool) {
	n := len(os)
	if n <= 1 {
		return
	}
	// Allow bits.Len(n) badly balanced partitions before pdqsort falls back to
	// heapsort.
	limit := bits.Len(uint(n))
	pdqsort(data, 0, n, limit, os, fn)
}

// pdqsort sorts os[a:b], comparing the rows of data that the offsets refer to.
// The algorithm based on pattern-defeating quicksort(pdqsort), but without the optimizations from BlockQuicksort.
// pdqsort paper: https://arxiv.org/pdf/2106.05123.pdf
// C++ implementation: https://github.com/orlp/pdqsort
// Rust implementation: https://docs.rs/pdqsort/latest/pdqsort/
// limit is the number of allowed bad (very unbalanced) pivots before falling back to heapsort.
func pdqsort[T any](data []T, a, b, limit int, os []int64, fn func([]T, int64, int64) bool) {
	const maxInsertion = 12

	var (
		wasBalanced    = true // whether the last partitioning was reasonably balanced
		wasPartitioned = true // whether the slice was already partitioned
	)

	for {
		length := b - a

		// Short ranges are finished directly with insertion sort.
		if length <= maxInsertion {
			insertionSort(data, a, b, os, fn)
			return
		}

		// Fall back to heapsort if too many bad choices were made.
		if limit == 0 {
			heapSort(data, a, b, os, fn)
			return
		}

		// If the last partitioning was imbalanced, we need to break patterns.
		if !wasBalanced {
			breakPatterns(data, a, b, os)
			limit--
		}

		pivot, hint := choosePivot(data, a, b, os, fn)
		if hint == decreasingHint {
			reverseRange(data, a, b, os, fn)
			// The chosen pivot was pivot-a elements after the start of the array.
			// After reversing it is pivot-a elements before the end of the array.
			// The idea came from Rust's implementation.
			pivot = (b - 1) - (pivot - a)
			hint = increasingHint
		}

		// The slice is likely already sorted.
		if wasBalanced && wasPartitioned && hint == increasingHint {
			if partialInsertionSort(data, a, b, os, fn) {
				return
			}
		}

		// Probably the slice contains many duplicate elements, partition the slice into
		// elements equal to and elements greater than the pivot.
		if a > 0 && !fn(data, os[a-1], os[pivot]) {
			mid := partitionEqual(data, a, b, pivot, os, fn)
			a = mid
			continue
		}

		mid, alreadyPartitioned := partition(data, a, b, pivot, os, fn)
		wasPartitioned = alreadyPartitioned

		// Recurse into the shorter side and keep looping on the longer one.
		leftLen, rightLen := mid-a, b-mid
		balanceThreshold := length / 8
		if leftLen < rightLen {
			wasBalanced = leftLen >= balanceThreshold
			pdqsort(data, a, mid, limit, os, fn)
			a = mid + 1
		} else {
			wasBalanced = rightLen >= balanceThreshold
			pdqsort(data, mid+1, b, limit, os, fn)
			b = mid
		}
	}
}

// insertionSort sorts os[a:b] using insertion sort, ordering the offsets with fn.
func insertionSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) {
	for i := a + 1; i < b; i++ {
		// Sink os[i] towards a while fn places it before its left neighbour.
		for j := i; j > a && fn(data, os[j], os[j-1]); j-- {
			os[j], os[j-1] = os[j-1], os[j]
		}
	}
}

// siftDown implements the heap property on data[lo:hi].
// first is an offset into the array where the root of the heap lies.
func siftDown[T any](data []T, lo, hi, first int, os []int64, fn func([]T, int64, int64) bool) {
	// Heap positions are relative to first: node p is stored at os[first+p]
	// and its children are the nodes 2p+1 and 2p+2.
	parent := lo
	for {
		child := 2*parent + 1
		if child >= hi {
			return
		}
		// Of the two children, continue with the one fn orders last.
		if sibling := child + 1; sibling < hi && fn(data, os[first+child], os[first+sibling]) {
			child = sibling
		}
		// The heap property holds once the parent is not ordered before it.
		if !fn(data, os[first+parent], os[first+child]) {
			return
		}
		os[first+parent], os[first+child] = os[first+child], os[first+parent]
		parent = child
	}
}

// heapSort sorts os[a:b] with heapsort, ordering the offsets with fn.
func heapSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) {
	n := b - a

	// Heapify: sift every inner node down, last one first, so that the element
	// fn orders last ends up at the root.
	for node := (n - 1) / 2; node >= 0; node-- {
		siftDown(data, node, n, a, os, fn)
	}

	// Repeatedly move the root behind the shrinking heap and restore the heap.
	for end := n - 1; end >= 0; end-- {
		os[a], os[a+end] = os[a+end], os[a]
		siftDown(data, 0, end, a, os, fn)
	}
}

// partition does one quicksort partition.
// Let p = data[pivot]
// Moves elements in data[a:b] around, so that data[i]<p and data[j]>=p for i<newpivot and j>newpivot.
// On return, data[newpivot] = p
func partition[T any](data []T, a, b, pivot int, os []int64, fn func([]T, int64, int64) bool) (newpivot int, alreadyPartitioned bool) {
	// Park the pivot at the front; os[a] is the pivot until the final swap.
	os[a], os[pivot] = os[pivot], os[a]
	lo, hi := a+1, b-1 // both bounds are inclusive

	// The range counts as already partitioned as long as no pair had to be
	// exchanged.
	alreadyPartitioned = true
	for {
		for lo <= hi && fn(data, os[lo], os[a]) {
			lo++
		}
		for lo <= hi && !fn(data, os[hi], os[a]) {
			hi--
		}
		if lo > hi {
			break
		}
		alreadyPartitioned = false
		os[lo], os[hi] = os[hi], os[lo]
		lo++
		hi--
	}
	// hi is now the last position ordered before the pivot (or a itself).
	os[hi], os[a] = os[a], os[hi]
	return hi, alreadyPartitioned
}

// partitionEqual splits data[a:b] into the elements equal to data[pivot]
// followed by the elements greater than data[pivot], and returns the index of
// the first element of the second group.
// It assumes that data[a:b] contains no element smaller than data[pivot].
func partitionEqual[T any](data []T, a, b, pivot int, os []int64, fn func([]T, int64, int64) bool) (newpivot int) {
	os[a], os[pivot] = os[pivot], os[a]
	left, right := a+1, b-1 // both bounds are inclusive

	for {
		// Skip elements the pivot is not ordered before (the "equal" ones).
		for ; left <= right && !fn(data, os[a], os[left]); left++ {
		}
		// Skip elements the pivot is ordered before (the "greater" ones).
		for ; left <= right && fn(data, os[a], os[right]); right-- {
		}
		if left > right {
			return left
		}
		os[left], os[right] = os[right], os[left]
		left++
		right--
	}
}

// partialInsertionSort partially sorts a slice, returns true if the slice is sorted at the end.
563 func partialInsertionSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) bool { 564 const ( 565 maxSteps = 5 // maximum number of adjacent out-of-order pairs that will get shifted 566 shortestShifting = 50 // don't shift any elements on short arrays 567 ) 568 i := a + 1 569 for j := 0; j < maxSteps; j++ { 570 for i < b && !fn(data, os[i], os[i-1]) { 571 i++ 572 } 573 574 if i == b { 575 return true 576 } 577 578 if b-a < shortestShifting { 579 return false 580 } 581 582 os[i], os[i-1] = os[i-1], os[i] 583 584 // Shift the smaller one to the left. 585 if i-a >= 2 { 586 for j := i - 1; j >= 1; j-- { 587 if !fn(data, os[j], os[j-1]) { 588 break 589 } 590 os[j], os[j-1] = os[j-1], os[j] 591 } 592 } 593 // Shift the greater one to the right. 594 if b-i >= 2 { 595 for j := i + 1; j < b; j++ { 596 if !fn(data, os[j], os[j-1]) { 597 break 598 } 599 os[j], os[j-1] = os[j-1], os[j] 600 } 601 } 602 } 603 return false 604 } 605 606 // breakPatterns scatters some elements around in an attempt to break some patterns 607 // that might cause imbalanced partitions in quicksort. 608 func breakPatterns[T any](data []T, a, b int, os []int64) { 609 length := b - a 610 if length >= 8 { 611 random := xorshift(length) 612 modulus := nextPowerOfTwo(length) 613 614 for idx := a + (length/4)*2 - 1; idx <= a+(length/4)*2+1; idx++ { 615 other := int(uint(random.Next()) & (modulus - 1)) 616 if other >= length { 617 other -= length 618 } 619 os[idx], os[a+other] = os[a+other], os[idx] 620 } 621 } 622 } 623 624 // choosePivot chooses a pivot in data[a:b]. 625 // 626 // [0,8): chooses a static pivot. 627 // [8,shortestNinther): uses the simple median-of-three method. 628 // [shortestNinther,∞): uses the Tukey ninther method. 
629 func choosePivot[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) (pivot int, hint sortedHint) { 630 const ( 631 shortestNinther = 50 632 maxSwaps = 4 * 3 633 ) 634 635 l := b - a 636 637 var ( 638 swaps int 639 i = a + l/4*1 640 j = a + l/4*2 641 k = a + l/4*3 642 ) 643 644 if l >= 8 { 645 if l >= shortestNinther { 646 // Tukey ninther method, the idea came from Rust's implementation. 647 i = medianAdjacent(data, i, &swaps, os, fn) 648 j = medianAdjacent(data, j, &swaps, os, fn) 649 k = medianAdjacent(data, k, &swaps, os, fn) 650 } 651 // Find the median among i, j, k and stores it into j. 652 j = median(data, i, j, k, &swaps, os, fn) 653 } 654 655 switch swaps { 656 case 0: 657 return j, increasingHint 658 case maxSwaps: 659 return j, decreasingHint 660 default: 661 return j, unknownHint 662 } 663 } 664 665 // order2 returns x,y where data[x] <= data[y], where x,y=a,b or x,y=b,a. 666 func order2[T any](data []T, a, b int, swaps *int, os []int64, fn func([]T, int64, int64) bool) (int, int) { 667 if fn(data, os[b], os[a]) { 668 *swaps++ 669 return b, a 670 } 671 return a, b 672 } 673 674 // median returns x where data[x] is the median of data[a],data[b],data[c], where x is a, b, or c. 675 func median[T any](data []T, a, b, c int, swaps *int, os []int64, fn func([]T, int64, int64) bool) int { 676 a, b = order2(data, a, b, swaps, os, fn) 677 b, _ = order2(data, b, c, swaps, os, fn) 678 _, b = order2(data, a, b, swaps, os, fn) 679 return b 680 } 681 682 // medianAdjacent finds the median of data[a - 1], data[a], data[a + 1] and stores the index into a. 683 func medianAdjacent[T any](data []T, a int, swaps *int, os []int64, fn func([]T, int64, int64) bool) int { 684 return median(data, a-1, a, a+1, swaps, os, fn) 685 } 686 687 func reverseRange[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) { 688 i := a 689 j := b - 1 690 for i < j { 691 os[i], os[j] = os[j], os[i] 692 i++ 693 j-- 694 } 695 }