github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/page_bounds_amd64.go (about)

     1  //go:build !purego
     2  
     3  package parquet
     4  
     5  // The min-max algorithms combine looking for the min and max values in a single
     6  // pass over the data. While the behavior is the same as calling functions to
     7  // look for the min and max values independently, doing both operations at the
     8  // same time means that we only load the data from memory once. When working on
     9  // large arrays the algorithms are limited by memory bandwidth, computing both
    10  // the min and max together shrinks by half the amount of data read from memory.
    11  //
    12  // The following benchmarks results were highlighting the benefits of combining
    13  // the min-max search, compared to calling the min and max functions separately:
    14  //
    15  // name                 old time/op    new time/op    delta
    16  // BoundsInt64/10240KiB    590µs ±15%     330µs ±10%  -44.01%  (p=0.000 n=10+10)
    17  //
    18  // name                 old speed      new speed      delta
    19  // BoundsInt64/10240KiB 17.9GB/s ±13%  31.8GB/s ±11%  +78.13%  (p=0.000 n=10+10)
    20  //
    21  // As expected, since the functions are memory-bound in those cases, and load
    22  // half as much data, we see significant improvements. The gains are not 2x because
    23  // running more AVX-512 instructions in the tight loops causes more contention
    24  // on CPU ports.
    25  //
    26  //
    27  // Optimizations being trade offs, using min/max functions independently appears
    28  // to yield better throughput when the data resides in CPU caches:
    29  //
    30  // name             old time/op    new time/op    delta
    31  // BoundsInt64/4KiB   52.1ns ± 0%    46.2ns ± 1%  -12.65%  (p=0.000 n=10+10)
    32  //
    33  // name             old speed      new speed      delta
    34  // BoundsInt64/4KiB 78.6GB/s ± 0%  88.6GB/s ± 1%  +11.23%  (p=0.000 n=10+10)
    35  //
    36  // The probable explanation is that in those cases the algorithms are not
    37  // memory-bound anymore, but limited by contention on CPU ports, and the
    38  // individual min/max functions are able to better parallelize the work due
    39  // to running less instructions per loop. The performance starts to equalize
    40  // around 256KiB, and degrade beyond 1MiB, so we use this threshold to determine
    41  // which approach to prefer.
    42  const combinedBoundsThreshold = 1 * 1024 * 1024
    43  
// combinedBoundsBool returns the min and max values of data from a single
// call, following the combined min-max approach described on
// combinedBoundsThreshold.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): no caller is visible in this file, and the result for an empty
// slice cannot be determined from here.
//
//go:noescape
func combinedBoundsBool(data []bool) (min, max bool)
    46  
// combinedBoundsInt32 returns the min and max values of data from a single
// call. boundsInt32 uses it once the input reaches combinedBoundsThreshold
// bytes.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): the result for an empty slice cannot be determined from here.
//
//go:noescape
func combinedBoundsInt32(data []int32) (min, max int32)
    49  
// combinedBoundsInt64 returns the min and max values of data from a single
// call. boundsInt64 uses it once the input reaches combinedBoundsThreshold
// bytes.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): the result for an empty slice cannot be determined from here.
//
//go:noescape
func combinedBoundsInt64(data []int64) (min, max int64)
    52  
// combinedBoundsUint32 returns the min and max values of data from a single
// call. boundsUint32 uses it once the input reaches combinedBoundsThreshold
// bytes.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): the result for an empty slice cannot be determined from here.
//
//go:noescape
func combinedBoundsUint32(data []uint32) (min, max uint32)
    55  
// combinedBoundsUint64 returns the min and max values of data from a single
// call. boundsUint64 uses it once the input reaches combinedBoundsThreshold
// bytes.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): the result for an empty slice cannot be determined from here.
//
//go:noescape
func combinedBoundsUint64(data []uint64) (min, max uint64)
    58  
// combinedBoundsFloat32 returns the min and max values of data from a single
// call. boundsFloat32 uses it once the input reaches combinedBoundsThreshold
// bytes.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): the handling of NaN values and of an empty slice cannot be
// determined from here.
//
//go:noescape
func combinedBoundsFloat32(data []float32) (min, max float32)
    61  
// combinedBoundsFloat64 returns the min and max values of data from a single
// call. boundsFloat64 uses it once the input reaches combinedBoundsThreshold
// bytes.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): the handling of NaN values and of an empty slice cannot be
// determined from here.
//
//go:noescape
func combinedBoundsFloat64(data []float64) (min, max float64)
    64  
// combinedBoundsBE128 returns the min and max of the 16-byte values in data
// from a single call (the name suggests big-endian 128-bit ordering — confirm).
// boundsBE128 in this file does not call it; it uses minBE128 and maxBE128
// instead.
//
// The declaration has no Go body, so the implementation lives outside this
// file (presumably the amd64 assembly that accompanies it — confirm).
// NOTE(review): //go:noescape promises that pointers passed as arguments do
// not escape into the heap or into the returned values. If the returned slices
// alias data, that promise does not hold — verify against the implementation
// before adding callers.
//
//go:noescape
func combinedBoundsBE128(data [][16]byte) (min, max []byte)
    67  
    68  func boundsInt32(data []int32) (min, max int32) {
    69  	if 4*len(data) >= combinedBoundsThreshold {
    70  		return combinedBoundsInt32(data)
    71  	}
    72  	min = minInt32(data)
    73  	max = maxInt32(data)
    74  	return
    75  }
    76  
    77  func boundsInt64(data []int64) (min, max int64) {
    78  	if 8*len(data) >= combinedBoundsThreshold {
    79  		return combinedBoundsInt64(data)
    80  	}
    81  	min = minInt64(data)
    82  	max = maxInt64(data)
    83  	return
    84  }
    85  
    86  func boundsUint32(data []uint32) (min, max uint32) {
    87  	if 4*len(data) >= combinedBoundsThreshold {
    88  		return combinedBoundsUint32(data)
    89  	}
    90  	min = minUint32(data)
    91  	max = maxUint32(data)
    92  	return
    93  }
    94  
    95  func boundsUint64(data []uint64) (min, max uint64) {
    96  	if 8*len(data) >= combinedBoundsThreshold {
    97  		return combinedBoundsUint64(data)
    98  	}
    99  	min = minUint64(data)
   100  	max = maxUint64(data)
   101  	return
   102  }
   103  
   104  func boundsFloat32(data []float32) (min, max float32) {
   105  	if 4*len(data) >= combinedBoundsThreshold {
   106  		return combinedBoundsFloat32(data)
   107  	}
   108  	min = minFloat32(data)
   109  	max = maxFloat32(data)
   110  	return
   111  }
   112  
   113  func boundsFloat64(data []float64) (min, max float64) {
   114  	if 8*len(data) >= combinedBoundsThreshold {
   115  		return combinedBoundsFloat64(data)
   116  	}
   117  	min = minFloat64(data)
   118  	max = maxFloat64(data)
   119  	return
   120  }
   121  
   122  func boundsBE128(data [][16]byte) (min, max []byte) {
   123  	// TODO: min/max BE128 is really complex to vectorize, and the returns
   124  	// were barely better than doing the min and max independently, for all
   125  	// input sizes. We should revisit if we find ways to improve the min or
   126  	// max algorithms which can be transposed to the combined version.
   127  	min = minBE128(data)
   128  	max = maxBE128(data)
   129  	return
   130  }