github.com/matrixorigin/matrixone@v0.7.0/pkg/sort/sort.go (about)

     1  // Copyright 2021 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package sort
    16  
    17  import (
    18  	"math/bits"
    19  
    20  	"github.com/matrixorigin/matrixone/pkg/container/nulls"
    21  	"github.com/matrixorigin/matrixone/pkg/container/types"
    22  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    23  )
    24  
    25  const (
    26  	unknownHint sortedHint = iota
    27  	increasingHint
    28  	decreasingHint
    29  )
    30  
    31  type xorshift uint64
    32  type sortedHint int // hint for pdqsort when choosing the pivot
    33  
    34  func Sort(desc, nullsLast, hasNull bool, os []int64, vec *vector.Vector, strCol []string) {
    35  	if hasNull {
    36  		sz := len(os)
    37  		if nullsLast { // move null rows to the tail
    38  			var cursor int
    39  			for cursor < sz && !nulls.Contains(vec.Nsp, uint64(os[cursor])) {
    40  				cursor++
    41  			}
    42  			if cursor == sz {
    43  				return
    44  			}
    45  			for i := cursor; i < sz; i++ {
    46  				if !nulls.Contains(vec.Nsp, uint64(os[i])) {
    47  					os[cursor], os[i] = os[i], os[cursor]
    48  					cursor++
    49  				}
    50  			}
    51  			os = os[:cursor]
    52  		} else { // move null rows to the head
    53  			var cursor int
    54  			for cursor < sz && nulls.Contains(vec.Nsp, uint64(os[cursor])) {
    55  				cursor++
    56  			}
    57  			if cursor == sz {
    58  				return
    59  			}
    60  			for i := cursor; i < sz; i++ {
    61  				if nulls.Contains(vec.Nsp, uint64(os[i])) {
    62  					os[cursor], os[i] = os[i], os[cursor]
    63  					cursor++
    64  				}
    65  			}
    66  			os = os[cursor:]
    67  		}
    68  	}
    69  	// sort only non-null rows
    70  	switch vec.Typ.Oid {
    71  	case types.T_bool:
    72  		col := vector.GetFixedVectorValues[bool](vec)
    73  		if !desc {
    74  			genericSort(col, os, boolLess[bool])
    75  		} else {
    76  			genericSort(col, os, boolGreater[bool])
    77  		}
    78  	case types.T_int8:
    79  		col := vector.GetFixedVectorValues[int8](vec)
    80  		if !desc {
    81  			genericSort(col, os, genericLess[int8])
    82  		} else {
    83  			genericSort(col, os, genericGreater[int8])
    84  		}
    85  	case types.T_int16:
    86  		col := vector.GetFixedVectorValues[int16](vec)
    87  		if !desc {
    88  			genericSort(col, os, genericLess[int16])
    89  		} else {
    90  			genericSort(col, os, genericGreater[int16])
    91  		}
    92  	case types.T_int32:
    93  		col := vector.GetFixedVectorValues[int32](vec)
    94  		if !desc {
    95  			genericSort(col, os, genericLess[int32])
    96  		} else {
    97  			genericSort(col, os, genericGreater[int32])
    98  		}
    99  	case types.T_int64:
   100  		col := vector.GetFixedVectorValues[int64](vec)
   101  		if !desc {
   102  			genericSort(col, os, genericLess[int64])
   103  		} else {
   104  			genericSort(col, os, genericGreater[int64])
   105  		}
   106  	case types.T_uint8:
   107  		col := vector.GetFixedVectorValues[uint8](vec)
   108  		if !desc {
   109  			genericSort(col, os, genericLess[uint8])
   110  		} else {
   111  			genericSort(col, os, genericGreater[uint8])
   112  		}
   113  	case types.T_uint16:
   114  		col := vector.GetFixedVectorValues[uint16](vec)
   115  		if !desc {
   116  			genericSort(col, os, genericLess[uint16])
   117  		} else {
   118  			genericSort(col, os, genericGreater[uint16])
   119  		}
   120  	case types.T_uint32:
   121  		col := vector.GetFixedVectorValues[uint32](vec)
   122  		if !desc {
   123  			genericSort(col, os, genericLess[uint32])
   124  		} else {
   125  			genericSort(col, os, genericGreater[uint32])
   126  		}
   127  	case types.T_uint64:
   128  		col := vector.GetFixedVectorValues[uint64](vec)
   129  		if !desc {
   130  			genericSort(col, os, genericLess[uint64])
   131  		} else {
   132  			genericSort(col, os, genericGreater[uint64])
   133  		}
   134  	case types.T_float32:
   135  		col := vector.GetFixedVectorValues[float32](vec)
   136  		if !desc {
   137  			genericSort(col, os, genericLess[float32])
   138  		} else {
   139  			genericSort(col, os, genericGreater[float32])
   140  		}
   141  	case types.T_float64:
   142  		col := vector.GetFixedVectorValues[float64](vec)
   143  		if !desc {
   144  			genericSort(col, os, genericLess[float64])
   145  		} else {
   146  			genericSort(col, os, genericGreater[float64])
   147  		}
   148  	case types.T_date:
   149  		col := vector.GetFixedVectorValues[types.Date](vec)
   150  		if !desc {
   151  			genericSort(col, os, genericLess[types.Date])
   152  		} else {
   153  			genericSort(col, os, genericGreater[types.Date])
   154  		}
   155  	case types.T_datetime:
   156  		col := vector.GetFixedVectorValues[types.Datetime](vec)
   157  		if !desc {
   158  			genericSort(col, os, genericLess[types.Datetime])
   159  		} else {
   160  			genericSort(col, os, genericGreater[types.Datetime])
   161  		}
   162  	case types.T_time:
   163  		col := vector.GetFixedVectorValues[types.Time](vec)
   164  		if !desc {
   165  			genericSort(col, os, genericLess[types.Time])
   166  		} else {
   167  			genericSort(col, os, genericGreater[types.Time])
   168  		}
   169  	case types.T_timestamp:
   170  		col := vector.GetFixedVectorValues[types.Timestamp](vec)
   171  		if !desc {
   172  			genericSort(col, os, genericLess[types.Timestamp])
   173  		} else {
   174  			genericSort(col, os, genericGreater[types.Timestamp])
   175  		}
   176  	case types.T_decimal64:
   177  		col := vector.GetFixedVectorValues[types.Decimal64](vec)
   178  		if !desc {
   179  			genericSort(col, os, decimal64Less)
   180  		} else {
   181  			genericSort(col, os, decimal64Greater)
   182  		}
   183  	case types.T_decimal128:
   184  		col := vector.GetFixedVectorValues[types.Decimal128](vec)
   185  		if !desc {
   186  			genericSort(col, os, decimal128Less)
   187  		} else {
   188  			genericSort(col, os, decimal128Greater)
   189  		}
   190  	case types.T_uuid:
   191  		col := vector.GetFixedVectorValues[types.Uuid](vec)
   192  		if !desc {
   193  			genericSort(col, os, uuidLess)
   194  		} else {
   195  			genericSort(col, os, uuidGreater)
   196  		}
   197  	case types.T_char, types.T_varchar, types.T_blob, types.T_text:
   198  		if strCol == nil {
   199  			strCol = vector.GetStrVectorValues(vec)
   200  		}
   201  		if !desc {
   202  			genericSort(strCol, os, genericLess[string])
   203  		} else {
   204  			genericSort(strCol, os, genericGreater[string])
   205  		}
   206  	}
   207  }
   208  
   209  func boolLess[T bool](data []T, i, j int64) bool {
   210  	return bool(!data[i] && data[j])
   211  }
   212  
   213  func boolGreater[T bool](data []T, i, j int64) bool {
   214  	return bool(data[i] && !data[j])
   215  }
   216  
   217  func decimal64Less(data []types.Decimal64, i, j int64) bool {
   218  	return data[i].Compare(data[j]) < 0
   219  }
   220  
   221  func decimal64Greater(data []types.Decimal64, i, j int64) bool {
   222  	return data[i].Compare(data[j]) > 0
   223  }
   224  
   225  func decimal128Less(data []types.Decimal128, i, j int64) bool {
   226  	return data[i].Compare(data[j]) < 0
   227  }
   228  
   229  func decimal128Greater(data []types.Decimal128, i, j int64) bool {
   230  	return data[i].Compare(data[j]) > 0
   231  }
   232  
   233  func uuidLess(data []types.Uuid, i, j int64) bool {
   234  	return data[i].Compare(data[j]) < 0
   235  }
   236  
   237  func uuidGreater(data []types.Uuid, i, j int64) bool {
   238  	return data[i].Compare(data[j]) > 0
   239  }
   240  
   241  func genericLess[T types.OrderedT](data []T, i, j int64) bool {
   242  	return data[i] < data[j]
   243  }
   244  
   245  func genericGreater[T types.OrderedT](data []T, i, j int64) bool {
   246  	return data[i] > data[j]
   247  }
   248  
   249  func (r *xorshift) Next() uint64 {
   250  	*r ^= *r << 13
   251  	*r ^= *r >> 17
   252  	*r ^= *r << 5
   253  	return uint64(*r)
   254  }
   255  
   256  func nextPowerOfTwo(length int) uint {
   257  	shift := uint(bits.Len(uint(length)))
   258  	return uint(1 << shift)
   259  }
   260  
   261  // Sort sorts data in ascending order as determined by the Less method.
   262  // It makes one call to data.Len to determine n and O(n*log(n)) calls to
   263  // data.Less and data.Swap. The sort is not guaranteed to be stable.
   264  func genericSort[T any](data []T, os []int64, fn func([]T, int64, int64) bool) {
   265  	n := len(os)
   266  	if n <= 1 {
   267  		return
   268  	}
   269  	limit := bits.Len(uint(n))
   270  	pdqsort(data, 0, n, limit, os, fn)
   271  }
   272  
   273  // pdqsort sorts data[a:b].
   274  // The algorithm based on pattern-defeating quicksort(pdqsort), but without the optimizations from BlockQuicksort.
   275  // pdqsort paper: https://arxiv.org/pdf/2106.05123.pdf
   276  // C++ implementation: https://github.com/orlp/pdqsort
   277  // Rust implementation: https://docs.rs/pdqsort/latest/pdqsort/
   278  // limit is the number of allowed bad (very unbalanced) pivots before falling back to heapsort.
   279  func pdqsort[T any](data []T, a, b, limit int, os []int64, fn func([]T, int64, int64) bool) {
   280  	const maxInsertion = 12
   281  
   282  	var (
   283  		wasBalanced    = true // whether the last partitioning was reasonably balanced
   284  		wasPartitioned = true // whether the slice was already partitioned
   285  	)
   286  
   287  	for {
   288  		length := b - a
   289  
   290  		if length <= maxInsertion {
   291  			insertionSort(data, a, b, os, fn)
   292  			return
   293  		}
   294  
   295  		// Fall back to heapsort if too many bad choices were made.
   296  		if limit == 0 {
   297  			heapSort(data, a, b, os, fn)
   298  			return
   299  		}
   300  
   301  		// If the last partitioning was imbalanced, we need to breaking patterns.
   302  		if !wasBalanced {
   303  			breakPatterns(data, a, b, os)
   304  			limit--
   305  		}
   306  
   307  		pivot, hint := choosePivot(data, a, b, os, fn)
   308  		if hint == decreasingHint {
   309  			reverseRange(data, a, b, os, fn)
   310  			// The chosen pivot was pivot-a elements after the start of the array.
   311  			// After reversing it is pivot-a elements before the end of the array.
   312  			// The idea came from Rust's implementation.
   313  			pivot = (b - 1) - (pivot - a)
   314  			hint = increasingHint
   315  		}
   316  
   317  		// The slice is likely already sorted.
   318  		if wasBalanced && wasPartitioned && hint == increasingHint {
   319  			if partialInsertionSort(data, a, b, os, fn) {
   320  				return
   321  			}
   322  		}
   323  
   324  		// Probably the slice contains many duplicate elements, partition the slice into
   325  		// elements equal to and elements greater than the pivot.
   326  		if a > 0 && !fn(data, os[a-1], os[pivot]) {
   327  			mid := partitionEqual(data, a, b, pivot, os, fn)
   328  			a = mid
   329  			continue
   330  		}
   331  
   332  		mid, alreadyPartitioned := partition(data, a, b, pivot, os, fn)
   333  		wasPartitioned = alreadyPartitioned
   334  
   335  		leftLen, rightLen := mid-a, b-mid
   336  		balanceThreshold := length / 8
   337  		if leftLen < rightLen {
   338  			wasBalanced = leftLen >= balanceThreshold
   339  			pdqsort(data, a, mid, limit, os, fn)
   340  			a = mid + 1
   341  		} else {
   342  			wasBalanced = rightLen >= balanceThreshold
   343  			pdqsort(data, mid+1, b, limit, os, fn)
   344  			b = mid
   345  		}
   346  	}
   347  }
   348  
   349  // insertionSort sorts data[a:b] using insertion sort.
   350  func insertionSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) {
   351  	for i := a + 1; i < b; i++ {
   352  		for j := i; j > a && fn(data, os[j], os[j-1]); j-- {
   353  			os[j], os[j-1] = os[j-1], os[j]
   354  		}
   355  	}
   356  }
   357  
   358  // siftDown implements the heap property on data[lo:hi].
   359  // first is an offset into the array where the root of the heap lies.
   360  func siftDown[T any](data []T, lo, hi, first int, os []int64, fn func([]T, int64, int64) bool) {
   361  	root := lo
   362  	for {
   363  		child := 2*root + 1
   364  		if child >= hi {
   365  			break
   366  		}
   367  		if child+1 < hi && fn(data, os[first+child], os[first+child+1]) {
   368  			child++
   369  		}
   370  		if !fn(data, os[first+root], os[first+child]) {
   371  			return
   372  		}
   373  		os[first+root], os[first+child] = os[first+child], os[first+root]
   374  		root = child
   375  	}
   376  }
   377  
   378  func heapSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) {
   379  	first := a
   380  	lo := 0
   381  	hi := b - a
   382  
   383  	// Build heap with greatest element at top.
   384  	for i := (hi - 1) / 2; i >= 0; i-- {
   385  		siftDown(data, i, hi, first, os, fn)
   386  	}
   387  
   388  	// Pop elements, largest first, into end of data.
   389  	for i := hi - 1; i >= 0; i-- {
   390  		os[first], os[first+i] = os[first+i], os[first]
   391  		siftDown(data, lo, i, first, os, fn)
   392  	}
   393  }
   394  
   395  // partition does one quicksort partition.
   396  // Let p = data[pivot]
   397  // Moves elements in data[a:b] around, so that data[i]<p and data[j]>=p for i<newpivot and j>newpivot.
   398  // On return, data[newpivot] = p
   399  func partition[T any](data []T, a, b, pivot int, os []int64, fn func([]T, int64, int64) bool) (newpivot int, alreadyPartitioned bool) {
   400  	os[a], os[pivot] = os[pivot], os[a]
   401  	i, j := a+1, b-1 // i and j are inclusive of the elements remaining to be partitioned
   402  
   403  	for i <= j && fn(data, os[i], os[a]) {
   404  		i++
   405  	}
   406  	for i <= j && !fn(data, os[j], os[a]) {
   407  		j--
   408  	}
   409  	if i > j {
   410  		os[j], os[a] = os[a], os[j]
   411  		return j, true
   412  	}
   413  	os[i], os[j] = os[j], os[i]
   414  	i++
   415  	j--
   416  
   417  	for {
   418  		for i <= j && fn(data, os[i], os[a]) {
   419  			i++
   420  		}
   421  		for i <= j && !fn(data, os[j], os[a]) {
   422  			j--
   423  		}
   424  		if i > j {
   425  			break
   426  		}
   427  		os[i], os[j] = os[j], os[i]
   428  		i++
   429  		j--
   430  	}
   431  	os[j], os[a] = os[a], os[j]
   432  	return j, false
   433  }
   434  
   435  // partitionEqual partitions data[a:b] into elements equal to data[pivot] followed by elements greater than data[pivot].
   436  // It assumed that data[a:b] does not contain elements smaller than the data[pivot].
   437  func partitionEqual[T any](data []T, a, b, pivot int, os []int64, fn func([]T, int64, int64) bool) (newpivot int) {
   438  	os[a], os[pivot] = os[pivot], os[a]
   439  	i, j := a+1, b-1 // i and j are inclusive of the elements remaining to be partitioned
   440  
   441  	for {
   442  		for i <= j && !fn(data, os[a], os[i]) {
   443  			i++
   444  		}
   445  		for i <= j && fn(data, os[a], os[j]) {
   446  			j--
   447  		}
   448  		if i > j {
   449  			break
   450  		}
   451  		os[i], os[j] = os[j], os[i]
   452  		i++
   453  		j--
   454  	}
   455  	return i
   456  }
   457  
   458  // partialInsertionSort partially sorts a slice, returns true if the slice is sorted at the end.
   459  func partialInsertionSort[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) bool {
   460  	const (
   461  		maxSteps         = 5  // maximum number of adjacent out-of-order pairs that will get shifted
   462  		shortestShifting = 50 // don't shift any elements on short arrays
   463  	)
   464  	i := a + 1
   465  	for j := 0; j < maxSteps; j++ {
   466  		for i < b && !fn(data, os[i], os[i-1]) {
   467  			i++
   468  		}
   469  
   470  		if i == b {
   471  			return true
   472  		}
   473  
   474  		if b-a < shortestShifting {
   475  			return false
   476  		}
   477  
   478  		os[i], os[i-1] = os[i-1], os[i]
   479  
   480  		// Shift the smaller one to the left.
   481  		if i-a >= 2 {
   482  			for j := i - 1; j >= 1; j-- {
   483  				if !fn(data, os[j], os[j-1]) {
   484  					break
   485  				}
   486  				os[j], os[j-1] = os[j-1], os[j]
   487  			}
   488  		}
   489  		// Shift the greater one to the right.
   490  		if b-i >= 2 {
   491  			for j := i + 1; j < b; j++ {
   492  				if !fn(data, os[j], os[j-1]) {
   493  					break
   494  				}
   495  				os[j], os[j-1] = os[j-1], os[j]
   496  			}
   497  		}
   498  	}
   499  	return false
   500  }
   501  
   502  // breakPatterns scatters some elements around in an attempt to break some patterns
   503  // that might cause imbalanced partitions in quicksort.
   504  func breakPatterns[T any](data []T, a, b int, os []int64) {
   505  	length := b - a
   506  	if length >= 8 {
   507  		random := xorshift(length)
   508  		modulus := nextPowerOfTwo(length)
   509  
   510  		for idx := a + (length/4)*2 - 1; idx <= a+(length/4)*2+1; idx++ {
   511  			other := int(uint(random.Next()) & (modulus - 1))
   512  			if other >= length {
   513  				other -= length
   514  			}
   515  			os[idx], os[a+other] = os[a+other], os[idx]
   516  		}
   517  	}
   518  }
   519  
   520  // choosePivot chooses a pivot in data[a:b].
   521  //
   522  // [0,8): chooses a static pivot.
   523  // [8,shortestNinther): uses the simple median-of-three method.
   524  // [shortestNinther,∞): uses the Tukey ninther method.
   525  func choosePivot[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) (pivot int, hint sortedHint) {
   526  	const (
   527  		shortestNinther = 50
   528  		maxSwaps        = 4 * 3
   529  	)
   530  
   531  	l := b - a
   532  
   533  	var (
   534  		swaps int
   535  		i     = a + l/4*1
   536  		j     = a + l/4*2
   537  		k     = a + l/4*3
   538  	)
   539  
   540  	if l >= 8 {
   541  		if l >= shortestNinther {
   542  			// Tukey ninther method, the idea came from Rust's implementation.
   543  			i = medianAdjacent(data, i, &swaps, os, fn)
   544  			j = medianAdjacent(data, j, &swaps, os, fn)
   545  			k = medianAdjacent(data, k, &swaps, os, fn)
   546  		}
   547  		// Find the median among i, j, k and stores it into j.
   548  		j = median(data, i, j, k, &swaps, os, fn)
   549  	}
   550  
   551  	switch swaps {
   552  	case 0:
   553  		return j, increasingHint
   554  	case maxSwaps:
   555  		return j, decreasingHint
   556  	default:
   557  		return j, unknownHint
   558  	}
   559  }
   560  
   561  // order2 returns x,y where data[x] <= data[y], where x,y=a,b or x,y=b,a.
   562  func order2[T any](data []T, a, b int, swaps *int, os []int64, fn func([]T, int64, int64) bool) (int, int) {
   563  	if fn(data, os[b], os[a]) {
   564  		*swaps++
   565  		return b, a
   566  	}
   567  	return a, b
   568  }
   569  
   570  // median returns x where data[x] is the median of data[a],data[b],data[c], where x is a, b, or c.
   571  func median[T any](data []T, a, b, c int, swaps *int, os []int64, fn func([]T, int64, int64) bool) int {
   572  	a, b = order2(data, a, b, swaps, os, fn)
   573  	b, _ = order2(data, b, c, swaps, os, fn)
   574  	_, b = order2(data, a, b, swaps, os, fn)
   575  	return b
   576  }
   577  
   578  // medianAdjacent finds the median of data[a - 1], data[a], data[a + 1] and stores the index into a.
   579  func medianAdjacent[T any](data []T, a int, swaps *int, os []int64, fn func([]T, int64, int64) bool) int {
   580  	return median(data, a-1, a, a+1, swaps, os, fn)
   581  }
   582  
   583  func reverseRange[T any](data []T, a, b int, os []int64, fn func([]T, int64, int64) bool) {
   584  	i := a
   585  	j := b - 1
   586  	for i < j {
   587  		os[i], os[j] = os[j], os[i]
   588  		i++
   589  		j--
   590  	}
   591  }