github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/mkbench/split.go (about)

     1  package main
     2  
     3  import "sort"
     4  
     5  const increment = 50 // ops/sec
     6  
     7  // findOptimalSplit computes and returns a value that separates the given pass
     8  // and fail measurements optimally, such that the number of mis-classified
     9  // passes (pass values that fall above the split) and fails (fail values that
    10  // fall below the split) is minimized.
    11  //
    12  // The following gives a visual representation of the problem:
    13  //
    14  //		                     Optimal partition (=550) -----> |
    15  //	                                                         |
    16  //	  Passes:   o          o        o              o o o oo  |
    17  //	  Fails:                         x             x         |x    x  x     x x        x
    18  //	  |---------|---------|---------|---------|---------|----|----|---------|---------|---------|---> x
    19  //	  0        100       200       300       400       500   |   600       700       800       900
    20  //
    21  // The algorithm works by computing the error (i.e. mis-classifications) at
    22  // various points along the x-axis, starting from the origin and increasing by
    23  // the given increment.
    24  func findOptimalSplit(pass, fail []int) int {
    25  	// Not enough data to compute a sensible score.
    26  	if len(pass) == 0 || len(fail) == 0 {
    27  		return -1
    28  	}
    29  
    30  	// Maintain counters for the number of incorrectly classified passes and
    31  	// fails. All passes are initially incorrect, as we start at 0. Conversely,
    32  	// no fails are incorrectly classified, as all scores are >= 0.
    33  	pCount, fCount := len(pass), 0
    34  	p, f := make([]int, len(pass)), make([]int, len(fail))
    35  	copy(p, pass)
    36  	copy(f, fail)
    37  
    38  	// Sort the inputs.
    39  	sort.Slice(p, func(i, j int) bool {
    40  		return p[i] < p[j]
    41  	})
    42  	sort.Slice(f, func(i, j int) bool {
    43  		return f[i] < f[j]
    44  	})
    45  
    46  	// Find the global min and max.
    47  	min, max := p[0], f[len(fail)-1]
    48  
    49  	// Iterate over the range in increments.
    50  	var result [][]int
    51  	for x := min; x <= max; x = x + increment {
    52  		// Reduce the count of incorrect passes as x increases (i.e. fewer pass
    53  		// values are incorrect as x increases).
    54  		for len(p) > 0 && p[0] <= x {
    55  			pCount--
    56  			p = p[1:]
    57  		}
    58  
    59  		// Increase the count of incorrect fails as x increases (i.e. more fail
    60  		// values are incorrect as x increases).
    61  		for len(f) > 0 && f[0] < x {
    62  			fCount++
    63  			f = f[1:]
    64  		}
    65  
    66  		// Add a (x, score) tuple to result slice.
    67  		result = append(result, []int{x, pCount + fCount})
    68  	}
    69  
    70  	// Sort the (x, score) result slice by score ascending. Tie-break by x
    71  	// ascending.
    72  	sort.Slice(result, func(i, j int) bool {
    73  		if result[i][1] == result[j][1] {
    74  			return result[i][0] < result[j][0]
    75  		}
    76  		return result[i][1] < result[j][1]
    77  	})
    78  
    79  	// If there is more than one interval, split the difference between the min
    80  	// and the max.
    81  	splitMin, splitMax := result[0][0], result[0][0]
    82  	for i := 1; i < len(result); i++ {
    83  		if result[i][1] != result[0][1] {
    84  			break
    85  		}
    86  		splitMax = result[i][0]
    87  	}
    88  
    89  	return (splitMin + splitMax) / 2
    90  }