github.com/grailbio/base@v0.0.11/simd/multibyte_test.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 // +build !appengine 6 7 package simd_test 8 9 import ( 10 "math/rand" 11 "reflect" 12 "testing" 13 "unsafe" 14 15 "github.com/grailbio/base/simd" 16 "github.com/grailbio/testutil/expect" 17 ) 18 19 // The compiler clearly recognizes this; performance is almost 20 // indistinguishable from handcoded assembly. 21 func memset32Builtin(dst []uint32, val uint32) { 22 for idx := range dst { 23 dst[idx] = val 24 } 25 } 26 27 func TestMemset32(t *testing.T) { 28 maxSize := 500 29 nIter := 200 30 rand.Seed(1) 31 main1Arr := make([]uint32, maxSize) 32 main2Arr := make([]uint32, maxSize) 33 for iter := 0; iter < nIter; iter++ { 34 sliceStart := rand.Intn(maxSize) 35 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 36 u32Val := rand.Uint32() 37 main1Slice := main1Arr[sliceStart:sliceEnd] 38 main2Slice := main2Arr[sliceStart:sliceEnd] 39 sentinel := rand.Uint32() 40 main2Arr[sliceEnd] = sentinel 41 memset32Builtin(main1Slice, u32Val) 42 main2SliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main2Slice)) 43 simd.Memset32Raw(unsafe.Pointer(main2SliceHeader.Data), unsafe.Pointer(&u32Val), main2SliceHeader.Len) 44 if !reflect.DeepEqual(main1Slice, main2Slice) { 45 t.Fatal("Mismatched Memset32Raw result.") 46 } 47 if main2Arr[sliceEnd] != sentinel { 48 t.Fatal("Memset32Raw clobbered an extra byte.") 49 } 50 } 51 } 52 53 func memset16Standard(dst []uint16, val uint16) { 54 // This tends to be better than the range-for loop, though it's less 55 // clear-cut than the memset case. 56 nDst := len(dst) 57 if nDst != 0 { 58 dst[0] = val 59 for i := 1; i < nDst; { 60 i += copy(dst[i:], dst[:i]) 61 } 62 } 63 } 64 65 func TestMemset16(t *testing.T) { 66 maxSize := 500 67 nIter := 200 68 rand.Seed(1) 69 main1Arr := make([]uint16, maxSize) 70 main2Arr := make([]uint16, maxSize) 71 for iter := 0; iter < nIter; iter++ { 72 sliceStart := rand.Intn(maxSize) 73 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 74 u16Val := uint16(rand.Uint32()) 75 main1Slice := main1Arr[sliceStart:sliceEnd] 76 main2Slice := main2Arr[sliceStart:sliceEnd] 77 sentinel := uint16(rand.Uint32()) 78 main2Arr[sliceEnd] = sentinel 79 memset16Standard(main1Slice, u16Val) 80 simd.RepeatU16(main2Slice, u16Val) 81 if !reflect.DeepEqual(main1Slice, main2Slice) { 82 t.Fatal("Mismatched RepeatU16 result.") 83 } 84 if main2Arr[sliceEnd] != sentinel { 85 t.Fatal("RepeatU16 clobbered an extra byte.") 86 } 87 } 88 } 89 90 /* 91 Benchmark results: 92 MacBook Pro (15-inch, 2016) 93 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 94 95 Benchmark_Memset16/SIMDShort1Cpu-8 10 140130606 ns/op 96 Benchmark_Memset16/SIMDShortHalfCpu-8 50 37087600 ns/op 97 Benchmark_Memset16/SIMDShortAllCpu-8 50 35361817 ns/op 98 Benchmark_Memset16/SIMDLong1Cpu-8 1 1157494604 ns/op 99 Benchmark_Memset16/SIMDLongHalfCpu-8 2 921843584 ns/op 100 Benchmark_Memset16/SIMDLongAllCpu-8 2 960652822 ns/op 101 Benchmark_Memset16/StandardShort1Cpu-8 5 343877390 ns/op 102 Benchmark_Memset16/StandardShortHalfCpu-8 20 88295789 ns/op 103 Benchmark_Memset16/StandardShortAllCpu-8 20 86026817 ns/op 104 Benchmark_Memset16/StandardLong1Cpu-8 1 1038072481 ns/op 105 Benchmark_Memset16/StandardLongHalfCpu-8 2 979292703 ns/op 106 Benchmark_Memset16/StandardLongAllCpu-8 1 1052316741 ns/op 107 */ 108 109 type u16Args struct { 110 main []uint16 111 } 112 113 func memset16SimdSubtask(args interface{}, nIter int) int { 114 a := args.(u16Args) 115 for iter := 0; iter < nIter; iter++ { 116 simd.RepeatU16(a.main, 0x201) 117 } 118 return int(a.main[0]) 119 } 120 121 func memset16StandardSubtask(args interface{}, nIter int) int { 122 a := args.(u16Args) 123 for iter := 0; iter < nIter; iter++ { 124 memset16Standard(a.main, 0x201) 125 } 126 return int(a.main[0]) 127 } 128 129 func Benchmark_Memset16(b *testing.B) { 130 funcs := []taggedMultiBenchVarargsFunc{ 131 { 132 f: memset16SimdSubtask, 133 tag: "SIMD", 134 }, 135 { 136 f: memset16StandardSubtask, 137 tag: "Standard", 138 }, 139 } 140 for _, f := range funcs { 141 multiBenchmarkVarargs(f.f, f.tag+"Short", 9999999, func() interface{} { 142 return u16Args{ 143 main: make([]uint16, 75, 75+31), 144 } 145 }, b) 146 multiBenchmarkVarargs(f.f, f.tag+"Long", 50, func() interface{} { 147 return u16Args{ 148 main: make([]uint16, 249250622/2, 249250622/2+31), 149 } 150 }, b) 151 } 152 } 153 154 func indexU16Standard(main []uint16, val uint16) int { 155 for i, v := range main { 156 if v == val { 157 return i 158 } 159 } 160 return -1 161 } 162 163 func TestIndexU16(t *testing.T) { 164 // Generate nOuterIter random length-arrLen []uint16s, and perform nInnerIter 165 // random searches on each slice. 166 arrLen := 50000 167 nOuterIter := 5 168 nInnerIter := 100 169 valLimit := 65536 // maximum uint16 is 65535 170 rand.Seed(1) 171 mainArr := make([]uint16, arrLen) 172 for outerIdx := 0; outerIdx < nOuterIter; outerIdx++ { 173 for i := range mainArr { 174 mainArr[i] = uint16(rand.Intn(valLimit)) 175 } 176 for innerIdx := 0; innerIdx < nInnerIter; innerIdx++ { 177 needle := uint16(rand.Intn(valLimit)) 178 expected := indexU16Standard(mainArr, needle) 179 actual := simd.IndexU16(mainArr, needle) 180 expect.EQ(t, expected, actual) 181 } 182 } 183 } 184 185 const indexU16TestLimit = 100 186 187 func indexU16SimdSubtask(args interface{}, nIter int) int { 188 a := args.(u16Args) 189 sum := 0 190 needle := uint16(0) 191 for iter := 0; iter < nIter; iter++ { 192 sum += simd.IndexU16(a.main, needle) 193 needle++ 194 if needle == indexU16TestLimit { 195 needle = 0 196 } 197 } 198 return sum 199 } 200 201 func indexU16StandardSubtask(args interface{}, nIter int) int { 202 a := args.(u16Args) 203 sum := 0 204 needle := uint16(0) 205 for iter := 0; iter < nIter; iter++ { 206 sum += indexU16Standard(a.main, needle) 207 needle++ 208 if needle == indexU16TestLimit { 209 needle = 0 210 } 211 } 212 return sum 213 } 214 215 // Single-threaded performance is ~4x as good in my testing. 216 func Benchmark_IndexU16(b *testing.B) { 217 funcs := []taggedMultiBenchVarargsFunc{ 218 { 219 f: indexU16SimdSubtask, 220 tag: "SIMD", 221 }, 222 { 223 f: indexU16StandardSubtask, 224 tag: "Standard", 225 }, 226 } 227 for _, f := range funcs { 228 multiBenchmarkVarargs(f.f, f.tag+"Long", 50, func() interface{} { 229 return u16Args{ 230 main: make([]uint16, 4000000, 4000000+31), 231 } 232 }, b) 233 } 234 } 235 236 func reverseU16Slow(main []uint16) { 237 nU16 := len(main) 238 nU16Div2 := nU16 >> 1 239 for idx, invIdx := 0, nU16-1; idx != nU16Div2; idx, invIdx = idx+1, invIdx-1 { 240 main[idx], main[invIdx] = main[invIdx], main[idx] 241 } 242 } 243 244 func TestReverse16(t *testing.T) { 245 maxSize := 500 246 nIter := 200 247 rand.Seed(1) 248 main1Arr := make([]uint16, maxSize) 249 main2Arr := make([]uint16, maxSize) 250 main3Arr := make([]uint16, maxSize) 251 src2Arr := make([]uint16, maxSize) 252 for iter := 0; iter < nIter; iter++ { 253 sliceStart := rand.Intn(maxSize) 254 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 255 main1Slice := main1Arr[sliceStart:sliceEnd] 256 main2Slice := main2Arr[sliceStart:sliceEnd] 257 main3Slice := main3Arr[sliceStart:sliceEnd] 258 src2Slice := src2Arr[sliceStart:sliceEnd] 259 for ii := range main1Slice { 260 main1Slice[ii] = uint16(rand.Uint32()) 261 } 262 copy(main2Slice, main1Slice) 263 copy(src2Slice, main1Slice) 264 sentinel := uint16(rand.Uint32()) 265 main2Arr[sliceEnd] = sentinel 266 main3Arr[sliceEnd] = sentinel 267 simd.ReverseU16(main3Slice, main1Slice) 268 reverseU16Slow(main1Slice) 269 simd.ReverseU16Inplace(main2Slice) 270 if !reflect.DeepEqual(main1Slice, main2Slice) { 271 t.Fatal("Mismatched ReverseU16Inplace result.") 272 } 273 if main2Arr[sliceEnd] != sentinel { 274 t.Fatal("ReverseU16Inplace clobbered an extra byte.") 275 } 276 if !reflect.DeepEqual(main1Slice, main3Slice) { 277 t.Fatal("Mismatched ReverseU16 result.") 278 } 279 if main3Arr[sliceEnd] != sentinel { 280 t.Fatal("ReverseU16 clobbered an extra byte.") 281 } 282 simd.ReverseU16Inplace(main2Slice) 283 if !reflect.DeepEqual(src2Slice, main2Slice) { 284 t.Fatal("ReverseU16Inplace didn't invert itself.") 285 } 286 } 287 } 288 289 /* 290 Benchmark results: 291 MacBook Pro (15-inch, 2016) 292 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 293 294 Benchmark_ReverseU16Inplace/SIMDShort1Cpu-8 20 102899505 ns/op 295 Benchmark_ReverseU16Inplace/SIMDShortHalfCpu-8 50 32918441 ns/op 296 Benchmark_ReverseU16Inplace/SIMDShortAllCpu-8 30 38848510 ns/op 297 Benchmark_ReverseU16Inplace/SIMDLong1Cpu-8 1 1116384992 ns/op 298 Benchmark_ReverseU16Inplace/SIMDLongHalfCpu-8 2 880730467 ns/op 299 Benchmark_ReverseU16Inplace/SIMDLongAllCpu-8 2 943204867 ns/op 300 Benchmark_ReverseU16Inplace/SlowShort1Cpu-8 3 443056373 ns/op 301 Benchmark_ReverseU16Inplace/SlowShortHalfCpu-8 10 117142962 ns/op 302 Benchmark_ReverseU16Inplace/SlowShortAllCpu-8 10 159087579 ns/op 303 Benchmark_ReverseU16Inplace/SlowLong1Cpu-8 1 3158497662 ns/op 304 Benchmark_ReverseU16Inplace/SlowLongHalfCpu-8 2 967619258 ns/op 305 Benchmark_ReverseU16Inplace/SlowLongAllCpu-8 2 978231337 ns/op 306 */ 307 308 func reverseU16InplaceSimdSubtask(args interface{}, nIter int) int { 309 a := args.(u16Args) 310 for iter := 0; iter < nIter; iter++ { 311 simd.ReverseU16Inplace(a.main) 312 } 313 return int(a.main[0]) 314 } 315 316 func reverseU16InplaceSlowSubtask(args interface{}, nIter int) int { 317 a := args.(u16Args) 318 for iter := 0; iter < nIter; iter++ { 319 reverseU16Slow(a.main) 320 } 321 return int(a.main[0]) 322 } 323 324 func Benchmark_ReverseU16Inplace(b *testing.B) { 325 funcs := []taggedMultiBenchVarargsFunc{ 326 { 327 f: reverseU16InplaceSimdSubtask, 328 tag: "SIMD", 329 }, 330 { 331 f: reverseU16InplaceSlowSubtask, 332 tag: "Slow", 333 }, 334 } 335 for _, f := range funcs { 336 multiBenchmarkVarargs(f.f, f.tag+"Short", 9999999, func() interface{} { 337 return u16Args{ 338 main: make([]uint16, 75, 75+31), 339 } 340 }, b) 341 multiBenchmarkVarargs(f.f, f.tag+"Long", 50, func() interface{} { 342 return u16Args{ 343 main: make([]uint16, 249250622/2, 249250622/2+31), 344 } 345 }, b) 346 } 347 }