github.com/matrixorigin/matrixone@v0.7.0/pkg/sort/sort.go (about) 1 // Copyright 2021 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package sort 16 17 import ( 18 "math/bits" 19 20 "github.com/matrixorigin/matrixone/pkg/container/nulls" 21 "github.com/matrixorigin/matrixone/pkg/container/types" 22 "github.com/matrixorigin/matrixone/pkg/container/vector" 23 ) 24 25 const ( 26 unknownHint sortedHint = iota 27 increasingHint 28 decreasingHint 29 ) 30 31 type xorshift uint64 32 type sortedHint int // hint for pdqsort when choosing the pivot 33 34 func Sort(desc, nullsLast, hasNull bool, os []int64, vec *vector.Vector, strCol []string) { 35 if hasNull { 36 sz := len(os) 37 if nullsLast { // move null rows to the tail 38 var cursor int 39 for cursor < sz && !nulls.Contains(vec.Nsp, uint64(os[cursor])) { 40 cursor++ 41 } 42 if cursor == sz { 43 return 44 } 45 for i := cursor; i < sz; i++ { 46 if !nulls.Contains(vec.Nsp, uint64(os[i])) { 47 os[cursor], os[i] = os[i], os[cursor] 48 cursor++ 49 } 50 } 51 os = os[:cursor] 52 } else { // move null rows to the head 53 var cursor int 54 for cursor < sz && nulls.Contains(vec.Nsp, uint64(os[cursor])) { 55 cursor++ 56 } 57 if cursor == sz { 58 return 59 } 60 for i := cursor; i < sz; i++ { 61 if nulls.Contains(vec.Nsp, uint64(os[i])) { 62 os[cursor], os[i] = os[i], os[cursor] 63 cursor++ 64 } 65 } 66 os = os[cursor:] 67 } 68 } 69 // sort only non-null rows 70 switch vec.Typ.Oid { 71 case 
types.T_bool: 72 col := vector.GetFixedVectorValues[bool](vec) 73 if !desc { 74 genericSort(col, os, boolLess[bool]) 75 } else { 76 genericSort(col, os, boolGreater[bool]) 77 } 78 case types.T_int8: 79 col := vector.GetFixedVectorValues[int8](vec) 80 if !desc { 81 genericSort(col, os, genericLess[int8]) 82 } else { 83 genericSort(col, os, genericGreater[int8]) 84 } 85 case types.T_int16: 86 col := vector.GetFixedVectorValues[int16](vec) 87 if !desc { 88 genericSort(col, os, genericLess[int16]) 89 } else { 90 genericSort(col, os, genericGreater[int16]) 91 } 92 case types.T_int32: 93 col := vector.GetFixedVectorValues[int32](vec) 94 if !desc { 95 genericSort(col, os, genericLess[int32]) 96 } else { 97 genericSort(col, os, genericGreater[int32]) 98 } 99 case types.T_int64: 100 col := vector.GetFixedVectorValues[int64](vec) 101 if !desc { 102 genericSort(col, os, genericLess[int64]) 103 } else { 104 genericSort(col, os, genericGreater[int64]) 105 } 106 case types.T_uint8: 107 col := vector.GetFixedVectorValues[uint8](vec) 108 if !desc { 109 genericSort(col, os, genericLess[uint8]) 110 } else { 111 genericSort(col, os, genericGreater[uint8]) 112 } 113 case types.T_uint16: 114 col := vector.GetFixedVectorValues[uint16](vec) 115 if !desc { 116 genericSort(col, os, genericLess[uint16]) 117 } else { 118 genericSort(col, os, genericGreater[uint16]) 119 } 120 case types.T_uint32: 121 col := vector.GetFixedVectorValues[uint32](vec) 122 if !desc { 123 genericSort(col, os, genericLess[uint32]) 124 } else { 125 genericSort(col, os, genericGreater[uint32]) 126 } 127 case types.T_uint64: 128 col := vector.GetFixedVectorValues[uint64](vec) 129 if !desc { 130 genericSort(col, os, genericLess[uint64]) 131 } else { 132 genericSort(col, os, genericGreater[uint64]) 133 } 134 case types.T_float32: 135 col := vector.GetFixedVectorValues[float32](vec) 136 if !desc { 137 genericSort(col, os, genericLess[float32]) 138 } else { 139 genericSort(col, os, genericGreater[float32]) 140 } 141 case 
types.T_float64: 142 col := vector.GetFixedVectorValues[float64](vec) 143 if !desc { 144 genericSort(col, os, genericLess[float64]) 145 } else { 146 genericSort(col, os, genericGreater[float64]) 147 } 148 case types.T_date: 149 col := vector.GetFixedVectorValues[types.Date](vec) 150 if !desc { 151 genericSort(col, os, genericLess[types.Date]) 152 } else { 153 genericSort(col, os, genericGreater[types.Date]) 154 } 155 case types.T_datetime: 156 col := vector.GetFixedVectorValues[types.Datetime](vec) 157 if !desc { 158 genericSort(col, os, genericLess[types.Datetime]) 159 } else { 160 genericSort(col, os, genericGreater[types.Datetime]) 161 } 162 case types.T_time: 163 col := vector.GetFixedVectorValues[types.Time](vec) 164 if !desc { 165 genericSort(col, os, genericLess[types.Time]) 166 } else { 167 genericSort(col, os, genericGreater[types.Time]) 168 } 169 case types.T_timestamp: 170 col := vector.GetFixedVectorValues[types.Timestamp](vec) 171 if !desc { 172 genericSort(col, os, genericLess[types.Timestamp]) 173 } else { 174 genericSort(col, os, genericGreater[types.Timestamp]) 175 } 176 case types.T_decimal64: 177 col := vector.GetFixedVectorValues[types.Decimal64](vec) 178 if !desc { 179 genericSort(col, os, decimal64Less) 180 } else { 181 genericSort(col, os, decimal64Greater) 182 } 183 case types.T_decimal128: 184 col := vector.GetFixedVectorValues[types.Decimal128](vec) 185 if !desc { 186 genericSort(col, os, decimal128Less) 187 } else { 188 genericSort(col, os, decimal128Greater) 189 } 190 case types.T_uuid: 191 col := vector.GetFixedVectorValues[types.Uuid](vec) 192 if !desc { 193 genericSort(col, os, uuidLess) 194 } else { 195 genericSort(col, os, uuidGreater) 196 } 197 case types.T_char, types.T_varchar, types.T_blob, types.T_text: 198 if strCol == nil { 199 strCol = vector.GetStrVectorValues(vec) 200 } 201 if !desc { 202 genericSort(strCol, os, genericLess[string]) 203 } else { 204 genericSort(strCol, os, genericGreater[string]) 205 } 206 } 207 } 208 209 
func boolLess[T bool](data []T, i, j int64) bool { 210 return bool(!data[i] && data[j]) 211 } 212 213 func boolGreater[T bool](data []T, i, j int64) bool { 214 return bool(data[i] && !data[j]) 215 } 216 217 func decimal64Less(data []types.Decimal64, i, j int64) bool { 218 return data[i].Compare(data[j]) < 0 219 } 220 221 func decimal64Greater(data []types.Decimal64, i, j int64) bool { 222 return data[i].Compare(data[j]) > 0 223 } 224 225 func decimal128Less(data []types.Decimal128, i, j int64) bool { 226 return data[i].Compare(data[j]) < 0 227 } 228 229 func decimal128Greater(data []types.Decimal128, i, j int64) bool { 230 return data[i].Compare(data[j]) > 0 231 } 232 233 func uuidLess(data []types.Uuid, i, j int64) bool { 234 return data[i].Compare(data[j]) < 0 235 } 236 237 func uuidGreater(data []types.Uuid, i, j int64) bool { 238 return data[i].Compare(data[j]) > 0 239 } 240 241 func genericLess[T types.OrderedT](data []T, i, j int64) bool { 242 return data[i] < data[j] 243 } 244 245 func genericGreater[T types.OrderedT](data []T, i, j int64) bool { 246 return data[i] > data[j] 247 } 248 249 func (r *xorshift) Next() uint64 { 250 *r ^= *r << 13 251 *r ^= *r >> 17 252 *r ^= *r << 5 253 return uint64(*r) 254 } 255 256 func nextPowerOfTwo(length int) uint { 257 shift := uint(bits.Len(uint(length))) 258 return uint(1 << shift) 259 } 260 261 // Sort sorts data in ascending order as determined by the Less method. 262 // It makes one call to data.Len to determine n and O(n*log(n)) calls to 263 // data.Less and data.Swap. The sort is not guaranteed to be stable. 264 func genericSort[T any](data []T, os []int64, fn func([]T, int64, int64) bool) { 265 n := len(os) 266 if n <= 1 { 267 return 268 } 269 limit := bits.Len(uint(n)) 270 pdqsort(data, 0, n, limit, os, fn) 271 } 272 273 // pdqsort sorts data[a:b]. 274 // The algorithm based on pattern-defeating quicksort(pdqsort), but without the optimizations from BlockQuicksort. 
275 // pdqsort paper: https://arxiv.org/pdf/2106.05123.pdf 276 // C++ implementation: https://github.com/orlp/pdqsort 277 // Rust implementation: https://docs.rs/pdqsort/latest/pdqsort/ 278 // limit is the number of allowed bad (very unbalanced) pivots before falling back to heapsort. 279 func pdqsort[T any](data []T, a, b, limit int, os []int64, fn func([]T, int64, int64) bool) { 280 const maxInsertion = 12 281 282 var ( 283 wasBalanced = true // whether the last partitioning was reasonably balanced 284 wasPartitioned = true // whether the slice was already partitioned 285 ) 286 287 for { 288 length := b - a 289 290 if length <= maxInsertion { 291 insertionSort(data, a, b, os, fn) 292 return 293 } 294 295 // Fall back to heapsort if too many bad choices were made. 296 if limit == 0 { 297 heapSort(data, a, b, os, fn) 298 return 299 } 300 301 // If the last partitioning was imbalanced, we need to breaking patterns. 302 if !wasBalanced { 303 breakPatterns(data, a, b, os) 304 limit-- 305 } 306 307 pivot, hint := choosePivot(data, a, b, os, fn) 308 if hint == decreasingHint { 309 reverseRange(data, a, b, os, fn) 310 // The chosen pivot was pivot-a elements after the start of the array. 311 // After reversing it is pivot-a elements before the end of the array. 312 // The idea came from Rust's implementation. 313 pivot = (b - 1) - (pivot - a) 314 hint = increasingHint 315 } 316 317 // The slice is likely already sorted. 318 if wasBalanced && wasPartitioned && hint == increasingHint { 319 if partialInsertionSort(data, a, b, os, fn) { 320 return 321 } 322 } 323 324 // Probably the slice contains many duplicate elements, partition the slice into 325 // elements equal to and elements greater than the pivot. 
326 if a > 0 && !fn(data, os[a-1], os[pivot]) { 327 mid := partitionEqual(data, a, b, pivot, os, fn) 328 a = mid 329 continue 330 } 331 332 mid, alreadyPartitioned := partition(data, a, b, pivot, os, fn) 333 wasPartitioned = alreadyPartitioned 334 335 leftLen, rightLen := mid-a, b-mid 336 balanceThreshold := length / 8 337 if leftLen < rightLen { 338 wasBalanced = leftLen >= balanceThreshold 339 pdqsort(data, a, mid, limit, os, fn) 340 a = mid + 1 341 } else { 342 wasBalanced = rightLen >= balanceThreshold 343 pdqsort(data, mid+1, b, limit, os, fn) 344 b = mid 345 } 346 } 347 } 348 349 // insertionSort sorts data[a:b] using insertion sort. 350 func insertionSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) { 351 for i := a + 1; i < b; i++ { 352 for j := i; j > a && fn(data, os[j], os[j-1]); j-- { 353 os[j], os[j-1] = os[j-1], os[j] 354 } 355 } 356 } 357 358 // siftDown implements the heap property on data[lo:hi]. 359 // first is an offset into the array where the root of the heap lies. 360 func siftDown[T any](data []T, lo, hi, first int, os []int64, fn func([]T, int64, int64) bool) { 361 root := lo 362 for { 363 child := 2*root + 1 364 if child >= hi { 365 break 366 } 367 if child+1 < hi && fn(data, os[first+child], os[first+child+1]) { 368 child++ 369 } 370 if !fn(data, os[first+root], os[first+child]) { 371 return 372 } 373 os[first+root], os[first+child] = os[first+child], os[first+root] 374 root = child 375 } 376 } 377 378 func heapSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) { 379 first := a 380 lo := 0 381 hi := b - a 382 383 // Build heap with greatest element at top. 384 for i := (hi - 1) / 2; i >= 0; i-- { 385 siftDown(data, i, hi, first, os, fn) 386 } 387 388 // Pop elements, largest first, into end of data. 389 for i := hi - 1; i >= 0; i-- { 390 os[first], os[first+i] = os[first+i], os[first] 391 siftDown(data, lo, i, first, os, fn) 392 } 393 } 394 395 // partition does one quicksort partition. 
// Let p = data[pivot]
// Moves elements in data[a:b] around, so that data[i]<p and data[j]>=p for i<newpivot and j>newpivot.
// On return, data[newpivot] = p
//
// Positions refer to os: position x stands for the row data[os[x]], and only
// os is permuted. alreadyPartitioned is true when no pair of elements had to
// be exchanged to reach the partitioned state.
func partition[T any](data []T, a, b, pivot int, os []int64, fn func([]T, int64, int64) bool) (newpivot int, alreadyPartitioned bool) {
	// Park the pivot at the front of the range while the rest is scanned.
	os[a], os[pivot] = os[pivot], os[a]
	lo, hi := a+1, b-1 // both bounds are inclusive: the part still to be examined

	exchanged := false
	for {
		for lo <= hi && fn(data, os[lo], os[a]) {
			lo++
		}
		for lo <= hi && !fn(data, os[hi], os[a]) {
			hi--
		}
		if lo > hi {
			break
		}
		os[lo], os[hi] = os[hi], os[lo]
		lo++
		hi--
		exchanged = true
	}

	// Drop the pivot between the two parts.
	os[hi], os[a] = os[a], os[hi]
	return hi, !exchanged
}

// partitionEqual partitions positions [a, b) into elements equal to the pivot
// followed by elements greater than the pivot, and returns the position of the
// first element of the second part.
// It assumes that the range contains no element smaller than the pivot.
func partitionEqual[T any](data []T, a, b, pivot int, os []int64, fn func([]T, int64, int64) bool) (newpivot int) {
	os[a], os[pivot] = os[pivot], os[a]
	lo, hi := a+1, b-1 // both bounds are inclusive: the part still to be examined

	for {
		for lo <= hi && !fn(data, os[a], os[lo]) {
			lo++
		}
		for lo <= hi && fn(data, os[a], os[hi]) {
			hi--
		}
		if lo > hi {
			return lo
		}
		os[lo], os[hi] = os[hi], os[lo]
		lo++
		hi--
	}
}

// partialInsertionSort partially sorts a slice, returns true if the slice is sorted at the end.
459 func partialInsertionSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) bool { 460 const ( 461 maxSteps = 5 // maximum number of adjacent out-of-order pairs that will get shifted 462 shortestShifting = 50 // don't shift any elements on short arrays 463 ) 464 i := a + 1 465 for j := 0; j < maxSteps; j++ { 466 for i < b && !fn(data, os[i], os[i-1]) { 467 i++ 468 } 469 470 if i == b { 471 return true 472 } 473 474 if b-a < shortestShifting { 475 return false 476 } 477 478 os[i], os[i-1] = os[i-1], os[i] 479 480 // Shift the smaller one to the left. 481 if i-a >= 2 { 482 for j := i - 1; j >= 1; j-- { 483 if !fn(data, os[j], os[j-1]) { 484 break 485 } 486 os[j], os[j-1] = os[j-1], os[j] 487 } 488 } 489 // Shift the greater one to the right. 490 if b-i >= 2 { 491 for j := i + 1; j < b; j++ { 492 if !fn(data, os[j], os[j-1]) { 493 break 494 } 495 os[j], os[j-1] = os[j-1], os[j] 496 } 497 } 498 } 499 return false 500 } 501 502 // breakPatterns scatters some elements around in an attempt to break some patterns 503 // that might cause imbalanced partitions in quicksort. 504 func breakPatterns[T any](data []T, a, b int, os []int64) { 505 length := b - a 506 if length >= 8 { 507 random := xorshift(length) 508 modulus := nextPowerOfTwo(length) 509 510 for idx := a + (length/4)*2 - 1; idx <= a+(length/4)*2+1; idx++ { 511 other := int(uint(random.Next()) & (modulus - 1)) 512 if other >= length { 513 other -= length 514 } 515 os[idx], os[a+other] = os[a+other], os[idx] 516 } 517 } 518 } 519 520 // choosePivot chooses a pivot in data[a:b]. 521 // 522 // [0,8): chooses a static pivot. 523 // [8,shortestNinther): uses the simple median-of-three method. 524 // [shortestNinther,∞): uses the Tukey ninther method. 
525 func choosePivot[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) (pivot int, hint sortedHint) { 526 const ( 527 shortestNinther = 50 528 maxSwaps = 4 * 3 529 ) 530 531 l := b - a 532 533 var ( 534 swaps int 535 i = a + l/4*1 536 j = a + l/4*2 537 k = a + l/4*3 538 ) 539 540 if l >= 8 { 541 if l >= shortestNinther { 542 // Tukey ninther method, the idea came from Rust's implementation. 543 i = medianAdjacent(data, i, &swaps, os, fn) 544 j = medianAdjacent(data, j, &swaps, os, fn) 545 k = medianAdjacent(data, k, &swaps, os, fn) 546 } 547 // Find the median among i, j, k and stores it into j. 548 j = median(data, i, j, k, &swaps, os, fn) 549 } 550 551 switch swaps { 552 case 0: 553 return j, increasingHint 554 case maxSwaps: 555 return j, decreasingHint 556 default: 557 return j, unknownHint 558 } 559 } 560 561 // order2 returns x,y where data[x] <= data[y], where x,y=a,b or x,y=b,a. 562 func order2[T any](data []T, a, b int, swaps *int, os []int64, fn func([]T, int64, int64) bool) (int, int) { 563 if fn(data, os[b], os[a]) { 564 *swaps++ 565 return b, a 566 } 567 return a, b 568 } 569 570 // median returns x where data[x] is the median of data[a],data[b],data[c], where x is a, b, or c. 571 func median[T any](data []T, a, b, c int, swaps *int, os []int64, fn func([]T, int64, int64) bool) int { 572 a, b = order2(data, a, b, swaps, os, fn) 573 b, _ = order2(data, b, c, swaps, os, fn) 574 _, b = order2(data, a, b, swaps, os, fn) 575 return b 576 } 577 578 // medianAdjacent finds the median of data[a - 1], data[a], data[a + 1] and stores the index into a. 579 func medianAdjacent[T any](data []T, a int, swaps *int, os []int64, fn func([]T, int64, int64) bool) int { 580 return median(data, a-1, a, a+1, swaps, os, fn) 581 } 582 583 func reverseRange[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) { 584 i := a 585 j := b - 1 586 for i < j { 587 os[i], os[j] = os[j], os[i] 588 i++ 589 j-- 590 } 591 }