github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/multi_benchmark_test.go (about) 1 // Copyright 2019 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 package simd_test 6 7 import ( 8 "runtime" 9 "testing" 10 11 "github.com/Schaudge/grailbase/simd" 12 "github.com/Schaudge/grailbase/traverse" 13 ) 14 15 // Utility functions to assist with benchmarking of embarrassingly parallel 16 // jobs. It probably makes sense to move this code to a more central location 17 // at some point. 18 19 type multiBenchFunc func(dst, src []byte, nIter int) int 20 21 type taggedMultiBenchFunc struct { 22 f multiBenchFunc 23 tag string 24 } 25 26 type bytesInitFunc func(src []byte) 27 28 type multiBenchmarkOpts struct { 29 dstInit bytesInitFunc 30 srcInit bytesInitFunc 31 } 32 33 func multiBenchmark(bf multiBenchFunc, benchmarkSubtype string, nDstByte, nSrcByte, nJob int, b *testing.B, opts ...multiBenchmarkOpts) { 34 // 'bf' is expected to execute the benchmarking target nIter times. 35 // 36 // Given that, for each of the 3 nCpu settings below, multiBenchmark launches 37 // 'parallelism' goroutines, where each goroutine has nIter set to roughly 38 // (nJob / nCpu), so that the total number of benchmark-target-function 39 // invocations across all threads is nJob. It is designed to measure how 40 // effective traverse.Each-style parallelization is at reducing wall-clock 41 // runtime. 42 totalCpu := runtime.NumCPU() 43 cases := []struct { 44 nCpu int 45 descrip string 46 }{ 47 { 48 nCpu: 1, 49 descrip: "1Cpu", 50 }, 51 // 'Half' is often the saturation point, due to hyperthreading. 52 { 53 nCpu: (totalCpu + 1) / 2, 54 descrip: "HalfCpu", 55 }, 56 { 57 nCpu: totalCpu, 58 descrip: "AllCpu", 59 }, 60 } 61 var dstInit bytesInitFunc 62 var srcInit bytesInitFunc 63 if len(opts) >= 1 { 64 dstInit = opts[0].dstInit 65 srcInit = opts[0].srcInit 66 } 67 for _, c := range cases { 68 success := b.Run(benchmarkSubtype+c.descrip, func(b *testing.B) { 69 dsts := make([][]byte, c.nCpu) 70 srcs := make([][]byte, c.nCpu) 71 for i := 0; i < c.nCpu; i++ { 72 // Add 63 to prevent false sharing. 73 newArrDst := simd.MakeUnsafe(nDstByte + 63) 74 newArrSrc := simd.MakeUnsafe(nSrcByte + 63) 75 if i == 0 { 76 if dstInit != nil { 77 dstInit(newArrDst) 78 } 79 if srcInit != nil { 80 srcInit(newArrSrc) 81 } else { 82 for j := 0; j < nSrcByte; j++ { 83 newArrSrc[j] = byte(j * 3) 84 } 85 } 86 } else { 87 if dstInit != nil { 88 copy(newArrDst[:nDstByte], dsts[0]) 89 } 90 copy(newArrSrc[:nSrcByte], srcs[0]) 91 } 92 dsts[i] = newArrDst[:nDstByte] 93 srcs[i] = newArrSrc[:nSrcByte] 94 } 95 b.ResetTimer() 96 for i := 0; i < b.N; i++ { 97 // May want to replace this with something based on testing.B's 98 // RunParallel method. (Haven't done so yet since I don't see a clean 99 // way to make that play well with per-core preallocated buffers.) 100 _ = traverse.Each(c.nCpu, func(threadIdx int) error { 101 nIter := (((threadIdx + 1) * nJob) / c.nCpu) - ((threadIdx * nJob) / c.nCpu) 102 _ = bf(dsts[threadIdx], srcs[threadIdx], nIter) 103 return nil 104 }) 105 } 106 }) 107 if !success { 108 panic("benchmark failed") 109 } 110 } 111 } 112 113 func bytesInit0(src []byte) { 114 // do nothing 115 } 116 117 func bytesInitMax15(src []byte) { 118 for i := 0; i < len(src); i++ { 119 src[i] = byte(i*3) & 15 120 } 121 } 122 123 type multiBenchVarargsFunc func(args interface{}, nIter int) int 124 125 type taggedMultiBenchVarargsFunc struct { 126 f multiBenchVarargsFunc 127 tag string 128 } 129 130 type varargsFactory func() interface{} 131 132 func multiBenchmarkVarargs(bvf multiBenchVarargsFunc, benchmarkSubtype string, nJob int, argsFactory varargsFactory, b *testing.B) { 133 totalCpu := runtime.NumCPU() 134 cases := []struct { 135 nCpu int 136 descrip string 137 }{ 138 { 139 nCpu: 1, 140 descrip: "1Cpu", 141 }, 142 { 143 nCpu: (totalCpu + 1) / 2, 144 descrip: "HalfCpu", 145 }, 146 { 147 nCpu: totalCpu, 148 descrip: "AllCpu", 149 }, 150 } 151 for _, c := range cases { 152 success := b.Run(benchmarkSubtype+c.descrip, func(b *testing.B) { 153 var argSlice []interface{} 154 for i := 0; i < c.nCpu; i++ { 155 // Can take an "args interface{}" parameter and make deep copies 156 // instead. 157 argSlice = append(argSlice, argsFactory()) 158 } 159 b.ResetTimer() 160 for i := 0; i < b.N; i++ { 161 _ = traverse.Each(c.nCpu, func(threadIdx int) error { 162 nIter := (((threadIdx + 1) * nJob) / c.nCpu) - ((threadIdx * nJob) / c.nCpu) 163 _ = bvf(argSlice[threadIdx], nIter) 164 return nil 165 }) 166 } 167 }) 168 if !success { 169 panic("benchmark failed") 170 } 171 } 172 }