github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/util/unique/unique.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package unique
    12  
    13  import (
    14  	"bytes"
    15  	"reflect"
    16  	"sort"
    17  )
    18  
    19  // UniquifyByteSlices takes as input a slice of slices of bytes, and
    20  // deduplicates them using a sort and unique. The output will not contain any
    21  // duplicates but it will be sorted.
    22  func UniquifyByteSlices(slices [][]byte) [][]byte {
    23  	if len(slices) == 0 {
    24  		return slices
    25  	}
    26  	// First sort:
    27  	sort.Slice(slices, func(i int, j int) bool {
    28  		return bytes.Compare(slices[i], slices[j]) < 0
    29  	})
    30  	// Then distinct: (wouldn't it be nice if Go had generics?)
    31  	lastUniqueIdx := 0
    32  	for i := 1; i < len(slices); i++ {
    33  		if !bytes.Equal(slices[i], slices[lastUniqueIdx]) {
    34  			// We found a unique entry, at index i. The last unique entry in the array
    35  			// was at lastUniqueIdx, so set the entry after that one to our new unique
    36  			// entry, and bump lastUniqueIdx for the next loop iteration.
    37  			lastUniqueIdx++
    38  			slices[lastUniqueIdx] = slices[i]
    39  		}
    40  	}
    41  	slices = slices[:lastUniqueIdx+1]
    42  	return slices
    43  }
    44  
    45  // UniquifyAcrossSlices removes elements from both slices that are duplicated
    46  // across both of the slices. For example, inputs [1,2,3], [2,3,4] would remove
    47  // 2 and 3 from both lists.
    48  // It assumes that both slices are pre-sorted using the same comparison metric
    49  // as cmpFunc provides, and also already free of duplicates internally. It
    50  // returns the slices, which will have also been sorted as a side effect.
    51  // cmpFunc compares the lth index of left to the rth index of right. It must
    52  // return less than 0 if the left element is less than the right element, 0 if
    53  // equal, and greater than 0 otherwise.
    54  // setLeft sets the ith index of left to the jth index of left.
    55  // setRight sets the ith index of right to the jth index of right.
    56  // The function returns the new lengths of both input slices, whose elements
    57  // will have been mutated, but whose lengths must be set the new lengths by
    58  // the caller.
    59  func UniquifyAcrossSlices(
    60  	left interface{},
    61  	right interface{},
    62  	cmpFunc func(l, r int) int,
    63  	setLeft func(i, j int),
    64  	setRight func(i, j int),
    65  ) (leftLen, rightLen int) {
    66  	leftSlice := reflect.ValueOf(left)
    67  	rightSlice := reflect.ValueOf(right)
    68  
    69  	lLen := leftSlice.Len()
    70  	rLen := rightSlice.Len()
    71  
    72  	var lIn, lOut int
    73  	var rIn, rOut int
    74  
    75  	// Remove entries that are duplicated across both entry lists.
    76  	// This loop walks through both lists using a merge strategy. Two pointers per
    77  	// list are maintained. One is the "input pointer", which is always the ith
    78  	// element of the input list. One is the "output pointer", which is the index
    79  	// after the most recent unique element in the list. Every time we bump the
    80  	// input pointer, we also set the element at the output pointer to that at
    81  	// the input pointer, so we don't have to use extra space - we're
    82  	// deduplicating in-place.
    83  	for rIn < rLen || lIn < lLen {
    84  		var cmp int
    85  		if lIn == lLen {
    86  			cmp = 1
    87  		} else if rIn == rLen {
    88  			cmp = -1
    89  		} else {
    90  			cmp = cmpFunc(lIn, rIn)
    91  		}
    92  		if cmp < 0 {
    93  			setLeft(lOut, lIn)
    94  			lIn++
    95  			lOut++
    96  		} else if cmp > 0 {
    97  			setRight(rOut, rIn)
    98  			rIn++
    99  			rOut++
   100  		} else {
   101  			// Elements are identical - we want to remove them from the list. So
   102  			// we increment our input indices without touching our output indices.
   103  			// Next time through the loop, we'll shift the next element back to
   104  			// the last output index which is now lagging behind the input index.
   105  			lIn++
   106  			rIn++
   107  		}
   108  	}
   109  	return lOut, rOut
   110  }