github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/page_bounds_amd64.go (about) 1 //go:build !purego 2 3 package parquet 4 5 // The min-max algorithms combine looking for the min and max values in a single 6 // pass over the data. While the behavior is the same as calling functions to 7 // look for the min and max values independently, doing both operations at the 8 // same time means that we only load the data from memory once. When working on 9 // large arrays the algorithms are limited by memory bandwidth, computing both 10 // the min and max together shrinks by half the amount of data read from memory. 11 // 12 // The following benchmarks results were highlighting the benefits of combining 13 // the min-max search, compared to calling the min and max functions separately: 14 // 15 // name old time/op new time/op delta 16 // BoundsInt64/10240KiB 590µs ±15% 330µs ±10% -44.01% (p=0.000 n=10+10) 17 // 18 // name old speed new speed delta 19 // BoundsInt64/10240KiB 17.9GB/s ±13% 31.8GB/s ±11% +78.13% (p=0.000 n=10+10) 20 // 21 // As expected, since the functions are memory-bound in those cases, and load 22 // half as much data, we see significant improvements. The gains are not 2x because 23 // running more AVX-512 instructions in the tight loops causes more contention 24 // on CPU ports. 25 // 26 // Optimizations being trade offs, using min/max functions independently appears 27 // to yield better throughput when the data resides in CPU caches: 28 // 29 // name old time/op new time/op delta 30 // BoundsInt64/4KiB 52.1ns ± 0% 46.2ns ± 1% -12.65% (p=0.000 n=10+10) 31 // 32 // name old speed new speed delta 33 // BoundsInt64/4KiB 78.6GB/s ± 0% 88.6GB/s ± 1% +11.23% (p=0.000 n=10+10) 34 // 35 // The probable explanation is that in those cases the algorithms are not 36 // memory-bound anymore, but limited by contention on CPU ports, and the 37 // individual min/max functions are able to better parallelize the work due 38 // to running less instructions per loop. The performance starts to equalize 39 // around 256KiB, and degrade beyond 1MiB, so we use this threshold to determine 40 // which approach to prefer. 41 const combinedBoundsThreshold = 1 * 1024 * 1024 42 43 //go:noescape 44 func combinedBoundsBool(data []bool) (min, max bool) 45 46 //go:noescape 47 func combinedBoundsInt32(data []int32) (min, max int32) 48 49 //go:noescape 50 func combinedBoundsInt64(data []int64) (min, max int64) 51 52 //go:noescape 53 func combinedBoundsUint32(data []uint32) (min, max uint32) 54 55 //go:noescape 56 func combinedBoundsUint64(data []uint64) (min, max uint64) 57 58 //go:noescape 59 func combinedBoundsFloat32(data []float32) (min, max float32) 60 61 //go:noescape 62 func combinedBoundsFloat64(data []float64) (min, max float64) 63 64 //go:noescape 65 func combinedBoundsBE128(data [][16]byte) (min, max []byte) 66 67 func boundsInt32(data []int32) (min, max int32) { 68 if 4*len(data) >= combinedBoundsThreshold { 69 return combinedBoundsInt32(data) 70 } 71 min = minInt32(data) 72 max = maxInt32(data) 73 return 74 } 75 76 func boundsInt64(data []int64) (min, max int64) { 77 if 8*len(data) >= combinedBoundsThreshold { 78 return combinedBoundsInt64(data) 79 } 80 min = minInt64(data) 81 max = maxInt64(data) 82 return 83 } 84 85 func boundsUint32(data []uint32) (min, max uint32) { 86 if 4*len(data) >= combinedBoundsThreshold { 87 return combinedBoundsUint32(data) 88 } 89 min = minUint32(data) 90 max = maxUint32(data) 91 return 92 } 93 94 func boundsUint64(data []uint64) (min, max uint64) { 95 if 8*len(data) >= combinedBoundsThreshold { 96 return combinedBoundsUint64(data) 97 } 98 min = minUint64(data) 99 max = maxUint64(data) 100 return 101 } 102 103 func boundsFloat32(data []float32) (min, max float32) { 104 if 4*len(data) >= combinedBoundsThreshold { 105 return combinedBoundsFloat32(data) 106 } 107 min = minFloat32(data) 108 max = maxFloat32(data) 109 return 110 } 111 112 func boundsFloat64(data []float64) (min, max float64) { 113 if 8*len(data) >= combinedBoundsThreshold { 114 return combinedBoundsFloat64(data) 115 } 116 min = minFloat64(data) 117 max = maxFloat64(data) 118 return 119 } 120 121 func boundsBE128(data [][16]byte) (min, max []byte) { 122 // TODO: min/max BE128 is really complex to vectorize, and the returns 123 // were barely better than doing the min and max independently, for all 124 // input sizes. We should revisit if we find ways to improve the min or 125 // max algorithms which can be transposed to the combined version. 126 min = minBE128(data) 127 max = maxBE128(data) 128 return 129 }