github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/page_bounds_amd64.go (about)

     1  //go:build !purego
     2  
     3  package parquet
     4  
     5  // The min-max algorithms combine looking for the min and max values in a single
     6  // pass over the data. While the behavior is the same as calling functions to
     7  // look for the min and max values independently, doing both operations at the
     8  // same time means that we only load the data from memory once. When working on
     9  // large arrays the algorithms are limited by memory bandwidth, computing both
    10  // the min and max together shrinks by half the amount of data read from memory.
    11  //
// The following benchmark results highlight the benefits of combining the
// min-max search, compared to calling the min and max functions separately:
    14  //
    15  // name                 old time/op    new time/op    delta
    16  // BoundsInt64/10240KiB    590µs ±15%     330µs ±10%  -44.01%  (p=0.000 n=10+10)
    17  //
    18  // name                 old speed      new speed      delta
    19  // BoundsInt64/10240KiB 17.9GB/s ±13%  31.8GB/s ±11%  +78.13%  (p=0.000 n=10+10)
    20  //
    21  // As expected, since the functions are memory-bound in those cases, and load
    22  // half as much data, we see significant improvements. The gains are not 2x because
    23  // running more AVX-512 instructions in the tight loops causes more contention
    24  // on CPU ports.
    25  //
// Optimizations being trade-offs, using the min/max functions independently
// appears to yield better throughput when the data resides in CPU caches:
    28  //
    29  // name             old time/op    new time/op    delta
    30  // BoundsInt64/4KiB   52.1ns ± 0%    46.2ns ± 1%  -12.65%  (p=0.000 n=10+10)
    31  //
    32  // name             old speed      new speed      delta
    33  // BoundsInt64/4KiB 78.6GB/s ± 0%  88.6GB/s ± 1%  +11.23%  (p=0.000 n=10+10)
    34  //
// The probable explanation is that in those cases the algorithms are not
// memory-bound anymore, but limited by contention on CPU ports, and the
// individual min/max functions are able to better parallelize the work due
// to running fewer instructions per loop. The performance starts to equalize
// around 256KiB, and degrades beyond 1MiB, so we use this threshold to
// determine which approach to prefer.
    41  const combinedBoundsThreshold = 1 * 1024 * 1024
    42  
// combinedBoundsBool is the combined min/max search over a slice of bool.
// It is declared without a body, so the implementation lives outside this Go
// file (presumably in the matching amd64 assembly file; confirm). It is not
// called from the part of the file visible here.
//
//go:noescape
func combinedBoundsBool(data []bool) (min, max bool)

// combinedBoundsInt32 is the combined min/max search over a slice of int32,
// used by boundsInt32 for inputs of at least combinedBoundsThreshold bytes.
// Declared without a body: implemented outside this Go file.
//
//go:noescape
func combinedBoundsInt32(data []int32) (min, max int32)

// combinedBoundsInt64 is the combined min/max search over a slice of int64,
// used by boundsInt64 for inputs of at least combinedBoundsThreshold bytes.
// Declared without a body: implemented outside this Go file.
//
//go:noescape
func combinedBoundsInt64(data []int64) (min, max int64)

// combinedBoundsUint32 is the combined min/max search over a slice of uint32,
// used by boundsUint32 for inputs of at least combinedBoundsThreshold bytes.
// Declared without a body: implemented outside this Go file.
//
//go:noescape
func combinedBoundsUint32(data []uint32) (min, max uint32)

// combinedBoundsUint64 is the combined min/max search over a slice of uint64,
// used by boundsUint64 for inputs of at least combinedBoundsThreshold bytes.
// Declared without a body: implemented outside this Go file.
//
//go:noescape
func combinedBoundsUint64(data []uint64) (min, max uint64)

// combinedBoundsFloat32 is the combined min/max search over a slice of
// float32, used by boundsFloat32 for inputs of at least
// combinedBoundsThreshold bytes. Declared without a body: implemented outside
// this Go file.
// NOTE(review): NaN handling is not visible from this declaration.
//
//go:noescape
func combinedBoundsFloat32(data []float32) (min, max float32)

// combinedBoundsFloat64 is the combined min/max search over a slice of
// float64, used by boundsFloat64 for inputs of at least
// combinedBoundsThreshold bytes. Declared without a body: implemented outside
// this Go file.
// NOTE(review): NaN handling is not visible from this declaration.
//
//go:noescape
func combinedBoundsFloat64(data []float64) (min, max float64)

// combinedBoundsBE128 is the combined min/max search over 16-byte values
// (big-endian 128-bit, going by the BE128 name; confirm). boundsBE128 does not
// call it and uses minBE128 and maxBE128 instead. Declared without a body:
// implemented outside this Go file.
// NOTE(review): whether the returned slices alias data is not visible here.
//
//go:noescape
func combinedBoundsBE128(data [][16]byte) (min, max []byte)
    66  
    67  func boundsInt32(data []int32) (min, max int32) {
    68  	if 4*len(data) >= combinedBoundsThreshold {
    69  		return combinedBoundsInt32(data)
    70  	}
    71  	min = minInt32(data)
    72  	max = maxInt32(data)
    73  	return
    74  }
    75  
    76  func boundsInt64(data []int64) (min, max int64) {
    77  	if 8*len(data) >= combinedBoundsThreshold {
    78  		return combinedBoundsInt64(data)
    79  	}
    80  	min = minInt64(data)
    81  	max = maxInt64(data)
    82  	return
    83  }
    84  
    85  func boundsUint32(data []uint32) (min, max uint32) {
    86  	if 4*len(data) >= combinedBoundsThreshold {
    87  		return combinedBoundsUint32(data)
    88  	}
    89  	min = minUint32(data)
    90  	max = maxUint32(data)
    91  	return
    92  }
    93  
    94  func boundsUint64(data []uint64) (min, max uint64) {
    95  	if 8*len(data) >= combinedBoundsThreshold {
    96  		return combinedBoundsUint64(data)
    97  	}
    98  	min = minUint64(data)
    99  	max = maxUint64(data)
   100  	return
   101  }
   102  
   103  func boundsFloat32(data []float32) (min, max float32) {
   104  	if 4*len(data) >= combinedBoundsThreshold {
   105  		return combinedBoundsFloat32(data)
   106  	}
   107  	min = minFloat32(data)
   108  	max = maxFloat32(data)
   109  	return
   110  }
   111  
   112  func boundsFloat64(data []float64) (min, max float64) {
   113  	if 8*len(data) >= combinedBoundsThreshold {
   114  		return combinedBoundsFloat64(data)
   115  	}
   116  	min = minFloat64(data)
   117  	max = maxFloat64(data)
   118  	return
   119  }
   120  
   121  func boundsBE128(data [][16]byte) (min, max []byte) {
   122  	// TODO: min/max BE128 is really complex to vectorize, and the returns
   123  	// were barely better than doing the min and max independently, for all
   124  	// input sizes. We should revisit if we find ways to improve the min or
   125  	// max algorithms which can be transposed to the combined version.
   126  	min = minBE128(data)
   127  	max = maxBE128(data)
   128  	return
   129  }