github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/page_bounds_amd64.go (about) 1 //go:build !purego 2 3 package parquet 4 5 // The min-max algorithms combine looking for the min and max values in a single 6 // pass over the data. While the behavior is the same as calling functions to 7 // look for the min and max values independently, doing both operations at the 8 // same time means that we only load the data from memory once. When working on 9 // large arrays the algorithms are limited by memory bandwidth, computing both 10 // the min and max together shrinks by half the amount of data read from memory. 11 // 12 // The following benchmarks results were highlighting the benefits of combining 13 // the min-max search, compared to calling the min and max functions separately: 14 // 15 // name old time/op new time/op delta 16 // BoundsInt64/10240KiB 590µs ±15% 330µs ±10% -44.01% (p=0.000 n=10+10) 17 // 18 // name old speed new speed delta 19 // BoundsInt64/10240KiB 17.9GB/s ±13% 31.8GB/s ±11% +78.13% (p=0.000 n=10+10) 20 // 21 // As expected, since the functions are memory-bound in those cases, and load 22 // half as much data, we see significant improvements. The gains are not 2x because 23 // running more AVX-512 instructions in the tight loops causes more contention 24 // on CPU ports. 25 // 26 // 27 // Optimizations being trade offs, using min/max functions independently appears 28 // to yield better throughput when the data resides in CPU caches: 29 // 30 // name old time/op new time/op delta 31 // BoundsInt64/4KiB 52.1ns ± 0% 46.2ns ± 1% -12.65% (p=0.000 n=10+10) 32 // 33 // name old speed new speed delta 34 // BoundsInt64/4KiB 78.6GB/s ± 0% 88.6GB/s ± 1% +11.23% (p=0.000 n=10+10) 35 // 36 // The probable explanation is that in those cases the algorithms are not 37 // memory-bound anymore, but limited by contention on CPU ports, and the 38 // individual min/max functions are able to better parallelize the work due 39 // to running less instructions per loop. The performance starts to equalize 40 // around 256KiB, and degrade beyond 1MiB, so we use this threshold to determine 41 // which approach to prefer. 42 const combinedBoundsThreshold = 1 * 1024 * 1024 43 44 //go:noescape 45 func combinedBoundsBool(data []bool) (min, max bool) 46 47 //go:noescape 48 func combinedBoundsInt32(data []int32) (min, max int32) 49 50 //go:noescape 51 func combinedBoundsInt64(data []int64) (min, max int64) 52 53 //go:noescape 54 func combinedBoundsUint32(data []uint32) (min, max uint32) 55 56 //go:noescape 57 func combinedBoundsUint64(data []uint64) (min, max uint64) 58 59 //go:noescape 60 func combinedBoundsFloat32(data []float32) (min, max float32) 61 62 //go:noescape 63 func combinedBoundsFloat64(data []float64) (min, max float64) 64 65 //go:noescape 66 func combinedBoundsBE128(data [][16]byte) (min, max []byte) 67 68 func boundsInt32(data []int32) (min, max int32) { 69 if 4*len(data) >= combinedBoundsThreshold { 70 return combinedBoundsInt32(data) 71 } 72 min = minInt32(data) 73 max = maxInt32(data) 74 return 75 } 76 77 func boundsInt64(data []int64) (min, max int64) { 78 if 8*len(data) >= combinedBoundsThreshold { 79 return combinedBoundsInt64(data) 80 } 81 min = minInt64(data) 82 max = maxInt64(data) 83 return 84 } 85 86 func boundsUint32(data []uint32) (min, max uint32) { 87 if 4*len(data) >= combinedBoundsThreshold { 88 return combinedBoundsUint32(data) 89 } 90 min = minUint32(data) 91 max = maxUint32(data) 92 return 93 } 94 95 func boundsUint64(data []uint64) (min, max uint64) { 96 if 8*len(data) >= combinedBoundsThreshold { 97 return combinedBoundsUint64(data) 98 } 99 min = minUint64(data) 100 max = maxUint64(data) 101 return 102 } 103 104 func boundsFloat32(data []float32) (min, max float32) { 105 if 4*len(data) >= combinedBoundsThreshold { 106 return combinedBoundsFloat32(data) 107 } 108 min = minFloat32(data) 109 max = maxFloat32(data) 110 return 111 } 112 113 func boundsFloat64(data []float64) (min, max float64) { 114 if 8*len(data) >= combinedBoundsThreshold { 115 return combinedBoundsFloat64(data) 116 } 117 min = minFloat64(data) 118 max = maxFloat64(data) 119 return 120 } 121 122 func boundsBE128(data [][16]byte) (min, max []byte) { 123 // TODO: min/max BE128 is really complex to vectorize, and the returns 124 // were barely better than doing the min and max independently, for all 125 // input sizes. We should revisit if we find ways to improve the min or 126 // max algorithms which can be transposed to the combined version. 127 min = minBE128(data) 128 max = maxBE128(data) 129 return 130 }