github.com/grailbio/base@v0.0.11/simd/simd_test.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 package simd_test 6 7 import ( 8 "bytes" 9 "encoding/binary" 10 "math/rand" 11 "testing" 12 13 "github.com/grailbio/base/simd" 14 "github.com/grailbio/testutil/assert" 15 ) 16 17 // This is the most-frequently-recommended implementation. It's decent, so the 18 // suffix is 'Standard' instead of 'Slow'. 19 func memset8Standard(dst []byte, val byte) { 20 dstLen := len(dst) 21 if dstLen != 0 { 22 dst[0] = val 23 for i := 1; i < dstLen; { 24 i += copy(dst[i:], dst[:i]) 25 } 26 } 27 } 28 29 func TestMemset8(t *testing.T) { 30 maxSize := 500 31 nIter := 200 32 main1Arr := simd.MakeUnsafe(maxSize) 33 main2Arr := simd.MakeUnsafe(maxSize) 34 main3Arr := simd.MakeUnsafe(maxSize) 35 for iter := 0; iter < nIter; iter++ { 36 sliceStart := rand.Intn(maxSize) 37 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 38 main1Slice := main1Arr[sliceStart:sliceEnd] 39 main2Slice := main2Arr[sliceStart:sliceEnd] 40 main3Slice := main3Arr[sliceStart:sliceEnd] 41 byteVal := byte(rand.Intn(256)) 42 memset8Standard(main1Slice, byteVal) 43 simd.Memset8Unsafe(main2Slice, byteVal) 44 if !bytes.Equal(main1Slice, main2Slice) { 45 t.Fatal("Mismatched Memset8Unsafe result.") 46 } 47 sentinel := byte(rand.Intn(256)) 48 if len(main3Slice) > 0 { 49 main3Slice[0] = 0 50 } 51 main3Arr[sliceEnd] = sentinel 52 simd.Memset8(main3Slice, byteVal) 53 if !bytes.Equal(main1Slice, main3Slice) { 54 t.Fatal("Mismatched Memset8 result.") 55 } 56 if main3Arr[sliceEnd] != sentinel { 57 t.Fatal("Memset8 clobbered an extra byte.") 58 } 59 } 60 } 61 62 /* 63 Benchmark results: 64 MacBook Pro (15-inch, 2016) 65 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 66 67 Benchmark_Memset8/SIMDShort1Cpu-8 20 62706981 ns/op 68 Benchmark_Memset8/SIMDShortHalfCpu-8 100 17559573 ns/op 69 Benchmark_Memset8/SIMDShortAllCpu-8 100 17149982 ns/op 70 Benchmark_Memset8/SIMDLong1Cpu-8 1 1101524485 ns/op 71 Benchmark_Memset8/SIMDLongHalfCpu-8 2 925331938 ns/op 72 Benchmark_Memset8/SIMDLongAllCpu-8 2 971422170 ns/op 73 Benchmark_Memset8/StandardShort1Cpu-8 5 314689466 ns/op 74 Benchmark_Memset8/StandardShortHalfCpu-8 20 88260588 ns/op 75 Benchmark_Memset8/StandardShortAllCpu-8 20 84317546 ns/op 76 Benchmark_Memset8/StandardLong1Cpu-8 1 1082736141 ns/op 77 Benchmark_Memset8/StandardLongHalfCpu-8 2 992904776 ns/op 78 Benchmark_Memset8/StandardLongAllCpu-8 1 1052452033 ns/op 79 Benchmark_Memset8/RangeZeroShort1Cpu-8 30 44907924 ns/op 80 Benchmark_Memset8/RangeZeroShortHalfCpu-8 100 24173280 ns/op 81 Benchmark_Memset8/RangeZeroShortAllCpu-8 100 14991003 ns/op 82 Benchmark_Memset8/RangeZeroLong1Cpu-8 3 401003587 ns/op 83 Benchmark_Memset8/RangeZeroLongHalfCpu-8 3 400711072 ns/op 84 Benchmark_Memset8/RangeZeroLongAllCpu-8 3 404863223 ns/op 85 86 Notes: simd.Memset8 is broadly useful for short arrays, though usually a bit 87 worse than memclr. However, memclr wins handily in the 249 MB long case on the 88 test machine, thanks to AVX2 (and, in the AVX2 subroutine, cache-bypassing 89 stores). 90 When the simd.Memset8 AVX2 implementation is written, it should obviously 91 imitate what memclr is doing. 92 */ 93 94 func memset8SimdSubtask(dst, src []byte, nIter int) int { 95 for iter := 0; iter < nIter; iter++ { 96 simd.Memset8(dst, 78) 97 } 98 return int(dst[0]) 99 } 100 101 func memset8StandardSubtask(dst, src []byte, nIter int) int { 102 for iter := 0; iter < nIter; iter++ { 103 memset8Standard(dst, 78) 104 } 105 return int(dst[0]) 106 } 107 108 func memset8RangeZeroSubtask(dst, src []byte, nIter int) int { 109 for iter := 0; iter < nIter; iter++ { 110 // Compiler-recognized loop, which gets converted to a memclr call with 111 // fancier optimizations than simd.Memset8. 112 for pos := range dst { 113 dst[pos] = 0 114 } 115 } 116 return int(dst[0]) 117 } 118 119 func Benchmark_Memset8(b *testing.B) { 120 funcs := []taggedMultiBenchFunc{ 121 { 122 f: memset8SimdSubtask, 123 tag: "SIMD", 124 }, 125 { 126 f: memset8StandardSubtask, 127 tag: "Standard", 128 }, 129 { 130 f: memset8RangeZeroSubtask, 131 tag: "RangeZero", 132 }, 133 } 134 for _, f := range funcs { 135 // Base sequence in length-150 .bam read occupies 75 bytes, so 75 is a good 136 // size for the short-array benchmark. 137 multiBenchmark(f.f, f.tag+"Short", 75, 0, 9999999, b) 138 // GRCh37 chromosome 1 length is 249250621, so that's a plausible 139 // long-array use case. 140 multiBenchmark(f.f, f.tag+"Long", 249250621, 0, 50, b) 141 } 142 } 143 144 // This only matches UnpackedNibbleLookupInplace when all bytes < 128; the test 145 // has been restricted accordingly. _mm_shuffle_epi8()'s treatment of bytes >= 146 // 128 usually isn't relevant. 147 func unpackedNibbleLookupInplaceSlow(main []byte, tablePtr *simd.NibbleLookupTable) { 148 for idx := range main { 149 main[idx] = tablePtr.Get(main[idx] & 15) 150 } 151 } 152 153 func TestUnpackedNibbleLookup(t *testing.T) { 154 maxSize := 500 155 nIter := 200 156 main1Arr := simd.MakeUnsafe(maxSize) 157 main2Arr := simd.MakeUnsafe(maxSize) 158 main3Arr := simd.MakeUnsafe(maxSize) 159 main4Arr := simd.MakeUnsafe(maxSize) 160 main5Arr := simd.MakeUnsafe(maxSize) 161 table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}) 162 for iter := 0; iter < nIter; iter++ { 163 sliceStart := rand.Intn(maxSize) 164 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 165 main1Slice := main1Arr[sliceStart:sliceEnd] 166 for ii := range main1Slice { 167 main1Slice[ii] = byte(rand.Intn(128)) 168 } 169 main2Slice := main2Arr[sliceStart:sliceEnd] 170 main3Slice := main3Arr[sliceStart:sliceEnd] 171 main4Slice := main4Arr[sliceStart:sliceEnd] 172 main5Slice := main5Arr[sliceStart:sliceEnd] 173 174 simd.UnpackedNibbleLookupUnsafe(main3Slice, main1Slice, &table) 175 176 sentinel := byte(rand.Intn(256)) 177 main4Arr[sliceEnd] = sentinel 178 simd.UnpackedNibbleLookup(main4Slice, main1Slice, &table) 179 180 copy(main2Slice, main1Slice) 181 copy(main5Slice, main1Slice) 182 183 unpackedNibbleLookupInplaceSlow(main1Slice, &table) 184 simd.UnpackedNibbleLookupUnsafeInplace(main2Slice, &table) 185 if !bytes.Equal(main1Slice, main2Slice) { 186 t.Fatal("Mismatched UnpackedNibbleLookupUnsafeInplace result.") 187 } 188 if !bytes.Equal(main1Slice, main3Slice) { 189 t.Fatal("Mismatched UnpackedNibbleLookupUnsafe result.") 190 } 191 if !bytes.Equal(main1Slice, main4Slice) { 192 t.Fatal("Mismatched UnpackedNibbleLookup result.") 193 } 194 if main4Arr[sliceEnd] != sentinel { 195 t.Fatal("UnpackedNibbleLookup clobbered an extra byte.") 196 } 197 198 main5Arr[sliceEnd] = sentinel 199 simd.UnpackedNibbleLookupInplace(main5Slice, &table) 200 if !bytes.Equal(main1Slice, main5Slice) { 201 t.Fatal("Mismatched UnpackedNibbleLookupInplace result.") 202 } 203 if main5Arr[sliceEnd] != sentinel { 204 t.Fatal("UnpackedNibbleLookupInplace clobbered an extra byte.") 205 } 206 } 207 } 208 209 /* 210 Benchmark results: 211 MacBook Pro (15-inch, 2016) 212 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 213 214 Benchmark_UnpackedNibbleLookupInplace/SIMDShort1Cpu-8 20 76720863 ns/op 215 Benchmark_UnpackedNibbleLookupInplace/SIMDShortHalfCpu-8 50 22968008 ns/op 216 Benchmark_UnpackedNibbleLookupInplace/SIMDShortAllCpu-8 100 18896633 ns/op 217 Benchmark_UnpackedNibbleLookupInplace/SIMDLong1Cpu-8 1 1046243684 ns/op 218 Benchmark_UnpackedNibbleLookupInplace/SIMDLongHalfCpu-8 2 861622838 ns/op 219 Benchmark_UnpackedNibbleLookupInplace/SIMDLongAllCpu-8 2 944384349 ns/op 220 Benchmark_UnpackedNibbleLookupInplace/SlowShort1Cpu-8 2 532267799 ns/op 221 Benchmark_UnpackedNibbleLookupInplace/SlowShortHalfCpu-8 10 144993320 ns/op 222 Benchmark_UnpackedNibbleLookupInplace/SlowShortAllCpu-8 10 146218387 ns/op 223 Benchmark_UnpackedNibbleLookupInplace/SlowLong1Cpu-8 1 7745668548 ns/op 224 Benchmark_UnpackedNibbleLookupInplace/SlowLongHalfCpu-8 1 2169127851 ns/op 225 Benchmark_UnpackedNibbleLookupInplace/SlowLongAllCpu-8 1 2164900359 ns/op 226 */ 227 228 func unpackedNibbleLookupInplaceSimdSubtask(dst, src []byte, nIter int) int { 229 table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}) 230 for iter := 0; iter < nIter; iter++ { 231 // Note that this uses the result of one lookup operation as the input to 232 // the next. 233 // (Given the current table, all values should be 1 or 0 after 3 or more 234 // iterations.) 235 simd.UnpackedNibbleLookupInplace(dst, &table) 236 } 237 return int(dst[0]) 238 } 239 240 func unpackedNibbleLookupInplaceSlowSubtask(dst, src []byte, nIter int) int { 241 table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}) 242 for iter := 0; iter < nIter; iter++ { 243 unpackedNibbleLookupInplaceSlow(dst, &table) 244 } 245 return int(dst[0]) 246 } 247 248 func Benchmark_UnpackedNibbleLookupInplace(b *testing.B) { 249 funcs := []taggedMultiBenchFunc{ 250 { 251 f: unpackedNibbleLookupInplaceSimdSubtask, 252 tag: "SIMD", 253 }, 254 { 255 f: unpackedNibbleLookupInplaceSlowSubtask, 256 tag: "Slow", 257 }, 258 } 259 for _, f := range funcs { 260 multiBenchmark(f.f, f.tag+"Short", 75, 0, 9999999, b) 261 multiBenchmark(f.f, f.tag+"Long", 249250621, 0, 50, b) 262 } 263 } 264 265 func packedNibbleLookupSlow(dst, src []byte, tablePtr *simd.NibbleLookupTable) { 266 dstLen := len(dst) 267 nSrcFullByte := dstLen / 2 268 srcOdd := dstLen & 1 269 for srcPos := 0; srcPos < nSrcFullByte; srcPos++ { 270 srcByte := src[srcPos] 271 dst[2*srcPos] = tablePtr.Get(srcByte & 15) 272 dst[2*srcPos+1] = tablePtr.Get(srcByte >> 4) 273 } 274 if srcOdd == 1 { 275 srcByte := src[nSrcFullByte] 276 dst[2*nSrcFullByte] = tablePtr.Get(srcByte & 15) 277 } 278 } 279 280 func TestPackedNibbleLookup(t *testing.T) { 281 maxDstSize := 500 282 maxSrcSize := (maxDstSize + 1) / 2 283 nIter := 200 284 srcArr := simd.MakeUnsafe(maxSrcSize) 285 dst1Arr := simd.MakeUnsafe(maxDstSize) 286 dst2Arr := simd.MakeUnsafe(maxDstSize) 287 table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}) 288 for iter := 0; iter < nIter; iter++ { 289 srcSliceStart := rand.Intn(maxSrcSize) 290 dstSliceStart := srcSliceStart * 2 291 dstSliceEnd := dstSliceStart + rand.Intn(maxDstSize-dstSliceStart) 292 srcSliceEnd := (dstSliceEnd + 1) / 2 293 srcSlice := srcArr[srcSliceStart:srcSliceEnd] 294 for ii := range srcSlice { 295 srcSlice[ii] = byte(rand.Intn(256)) 296 } 297 dst1Slice := dst1Arr[dstSliceStart:dstSliceEnd] 298 dst2Slice := dst2Arr[dstSliceStart:dstSliceEnd] 299 packedNibbleLookupSlow(dst1Slice, srcSlice, &table) 300 simd.PackedNibbleLookupUnsafe(dst2Slice, srcSlice, &table) 301 if !bytes.Equal(dst1Slice, dst2Slice) { 302 t.Fatal("Mismatched PackedNibbleLookupUnsafe result.") 303 } 304 // ack, missed a PackedNibbleLookup bug: it didn't write some of the last 305 // few bytes in some cases, but that went undetected because the previous 306 // PackedNibbleLookupUnsafe call pre-filled those bytes correctly. 307 simd.Memset8Unsafe(dst2Arr, 0) 308 sentinel := byte(rand.Intn(256)) 309 dst2Arr[dstSliceEnd] = sentinel 310 simd.PackedNibbleLookup(dst2Slice, srcSlice, &table) 311 if !bytes.Equal(dst1Slice, dst2Slice) { 312 t.Fatal("Mismatched PackedNibbleLookup result.") 313 } 314 if dst2Arr[dstSliceEnd] != sentinel { 315 t.Fatal("PackedNibbleLookup clobbered an extra byte.") 316 } 317 } 318 } 319 320 /* 321 Benchmark results: 322 MacBook Pro (15-inch, 2016) 323 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 324 325 Benchmark_PackedNibbleLookup/UnsafeShort1Cpu-8 10 143501956 ns/op 326 Benchmark_PackedNibbleLookup/UnsafeShortHalfCpu-8 30 38748958 ns/op 327 Benchmark_PackedNibbleLookup/UnsafeShortAllCpu-8 50 31982398 ns/op 328 Benchmark_PackedNibbleLookup/UnsafeLong1Cpu-8 1 1372142640 ns/op 329 Benchmark_PackedNibbleLookup/UnsafeLongHalfCpu-8 1 1236198290 ns/op 330 Benchmark_PackedNibbleLookup/UnsafeLongAllCpu-8 1 1265315746 ns/op 331 Benchmark_PackedNibbleLookup/SIMDShort1Cpu-8 10 158155872 ns/op 332 Benchmark_PackedNibbleLookup/SIMDShortHalfCpu-8 30 43098347 ns/op 333 Benchmark_PackedNibbleLookup/SIMDShortAllCpu-8 30 37593692 ns/op 334 Benchmark_PackedNibbleLookup/SIMDLong1Cpu-8 1 1407559630 ns/op 335 Benchmark_PackedNibbleLookup/SIMDLongHalfCpu-8 1 1244569913 ns/op 336 Benchmark_PackedNibbleLookup/SIMDLongAllCpu-8 1 1245648867 ns/op 337 Benchmark_PackedNibbleLookup/SlowShort1Cpu-8 1 1322739228 ns/op 338 Benchmark_PackedNibbleLookup/SlowShortHalfCpu-8 3 381551545 ns/op 339 Benchmark_PackedNibbleLookup/SlowShortAllCpu-8 3 361846656 ns/op 340 Benchmark_PackedNibbleLookup/SlowLong1Cpu-8 1 9990188206 ns/op 341 Benchmark_PackedNibbleLookup/SlowLongHalfCpu-8 1 2855687759 ns/op 342 Benchmark_PackedNibbleLookup/SlowLongAllCpu-8 1 2877628266 ns/op 343 344 Notes: Unsafe version of this function is also benchmarked, since the 345 short-array safety penalty is a bit high here. This is mainly an indicator of 346 room for improvement in the safe function; I think it's clear at this point 347 that we'll probably never need to use the Unsafe interface. 348 */ 349 350 func packedNibbleLookupUnsafeSubtask(dst, src []byte, nIter int) int { 351 table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}) 352 for iter := 0; iter < nIter; iter++ { 353 simd.PackedNibbleLookupUnsafe(dst, src, &table) 354 } 355 return int(dst[0]) 356 } 357 358 func packedNibbleLookupSimdSubtask(dst, src []byte, nIter int) int { 359 table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}) 360 for iter := 0; iter < nIter; iter++ { 361 simd.PackedNibbleLookup(dst, src, &table) 362 } 363 return int(dst[0]) 364 } 365 366 func packedNibbleLookupSlowSubtask(dst, src []byte, nIter int) int { 367 table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0}) 368 for iter := 0; iter < nIter; iter++ { 369 packedNibbleLookupSlow(dst, src, &table) 370 } 371 return int(dst[0]) 372 } 373 374 func Benchmark_PackedNibbleLookup(b *testing.B) { 375 funcs := []taggedMultiBenchFunc{ 376 { 377 f: packedNibbleLookupUnsafeSubtask, 378 tag: "Unsafe", 379 }, 380 { 381 f: packedNibbleLookupSimdSubtask, 382 tag: "SIMD", 383 }, 384 { 385 f: packedNibbleLookupSlowSubtask, 386 tag: "Slow", 387 }, 388 } 389 for _, f := range funcs { 390 multiBenchmark(f.f, f.tag+"Short", 150, 75, 9999999, b) 391 multiBenchmark(f.f, f.tag+"Long", 249250621, 249250622/2, 50, b) 392 } 393 } 394 395 func interleaveSlow(dst, even, odd []byte) { 396 dstLen := len(dst) 397 evenLen := (dstLen + 1) >> 1 398 oddLen := dstLen >> 1 399 for idx, oddByte := range odd { 400 dst[2*idx] = even[idx] 401 dst[2*idx+1] = oddByte 402 } 403 if oddLen != evenLen { 404 dst[oddLen*2] = even[oddLen] 405 } 406 } 407 408 func TestInterleave(t *testing.T) { 409 maxSrcSize := 500 410 maxDstSize := 2 * maxSrcSize 411 nIter := 200 412 evenArr := simd.MakeUnsafe(maxSrcSize) 413 oddArr := simd.MakeUnsafe(maxSrcSize) 414 dst1Arr := simd.MakeUnsafe(maxDstSize) 415 dst2Arr := simd.MakeUnsafe(maxDstSize) 416 for iter := 0; iter < nIter; iter++ { 417 srcSliceStart := rand.Intn(maxSrcSize) 418 dstSliceStart := srcSliceStart * 2 419 dstSliceEnd := dstSliceStart + rand.Intn(maxDstSize-dstSliceStart) 420 evenSliceEnd := (dstSliceEnd + 1) >> 1 421 oddSliceEnd := dstSliceEnd >> 1 422 evenSlice := evenArr[srcSliceStart:evenSliceEnd] 423 oddSlice := oddArr[srcSliceStart:oddSliceEnd] 424 for ii := range evenSlice { 425 evenSlice[ii] = byte(rand.Intn(256)) 426 } 427 for ii := range oddSlice { 428 oddSlice[ii] = byte(rand.Intn(256)) 429 } 430 dst1Slice := dst1Arr[dstSliceStart:dstSliceEnd] 431 dst2Slice := dst2Arr[dstSliceStart:dstSliceEnd] 432 interleaveSlow(dst1Slice, evenSlice, oddSlice) 433 simd.Interleave8Unsafe(dst2Slice, evenSlice, oddSlice) 434 if !bytes.Equal(dst1Slice, dst2Slice) { 435 t.Fatal("Mismatched Interleave8Unsafe result.") 436 } 437 sentinel := byte(rand.Intn(256)) 438 dst2Arr[dstSliceEnd] = sentinel 439 simd.Interleave8(dst2Slice, evenSlice, oddSlice) 440 if !bytes.Equal(dst1Slice, dst2Slice) { 441 t.Fatal("Mismatched Interleave8 result.") 442 } 443 if dst2Arr[dstSliceEnd] != sentinel { 444 t.Fatal("Interleave8 clobbered an extra byte.") 445 } 446 } 447 } 448 449 /* 450 Benchmark results: 451 MacBook Pro (15-inch, 2016) 452 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 453 454 Benchmark_Interleave/UnsafeShort1Cpu-8 10 124397567 ns/op 455 Benchmark_Interleave/UnsafeShortHalfCpu-8 50 33427370 ns/op 456 Benchmark_Interleave/UnsafeShortAllCpu-8 50 27522495 ns/op 457 Benchmark_Interleave/UnsafeLong1Cpu-8 1 1364788736 ns/op 458 Benchmark_Interleave/UnsafeLongHalfCpu-8 1 1194034677 ns/op 459 Benchmark_Interleave/UnsafeLongAllCpu-8 1 1240540994 ns/op 460 Benchmark_Interleave/SIMDShort1Cpu-8 10 143574503 ns/op 461 Benchmark_Interleave/SIMDShortHalfCpu-8 30 40429942 ns/op 462 Benchmark_Interleave/SIMDShortAllCpu-8 50 30500450 ns/op 463 Benchmark_Interleave/SIMDLong1Cpu-8 1 1281952758 ns/op 464 Benchmark_Interleave/SIMDLongHalfCpu-8 1 1210134670 ns/op 465 Benchmark_Interleave/SIMDLongAllCpu-8 1 1284786977 ns/op 466 Benchmark_Interleave/SlowShort1Cpu-8 2 880545817 ns/op 467 Benchmark_Interleave/SlowShortHalfCpu-8 5 234673823 ns/op 468 Benchmark_Interleave/SlowShortAllCpu-8 5 230332535 ns/op 469 Benchmark_Interleave/SlowLong1Cpu-8 1 6669283712 ns/op 470 Benchmark_Interleave/SlowLongHalfCpu-8 1 1860713287 ns/op 471 Benchmark_Interleave/SlowLongAllCpu-8 1 1807886977 ns/op 472 */ 473 474 func interleaveUnsafeSubtask(dst, src []byte, nIter int) int { 475 for iter := 0; iter < nIter; iter++ { 476 simd.Interleave8Unsafe(dst, src, src) 477 } 478 return int(dst[0]) 479 } 480 481 func interleaveSimdSubtask(dst, src []byte, nIter int) int { 482 for iter := 0; iter < nIter; iter++ { 483 simd.Interleave8(dst, src, src) 484 } 485 return int(dst[0]) 486 } 487 488 func interleaveSlowSubtask(dst, src []byte, nIter int) int { 489 for iter := 0; iter < nIter; iter++ { 490 interleaveSlow(dst, src, src) 491 } 492 return int(dst[0]) 493 } 494 495 func Benchmark_Interleave(b *testing.B) { 496 funcs := []taggedMultiBenchFunc{ 497 { 498 f: interleaveUnsafeSubtask, 499 tag: "Unsafe", 500 }, 501 { 502 f: interleaveSimdSubtask, 503 tag: "SIMD", 504 }, 505 { 506 f: interleaveSlowSubtask, 507 tag: "Slow", 508 }, 509 } 510 for _, f := range funcs { 511 multiBenchmark(f.f, f.tag+"Short", 150, 75, 9999999, b) 512 multiBenchmark(f.f, f.tag+"Long", 124625311*2, 124625311, 50, b) 513 } 514 } 515 516 func reverse8Slow(main []byte) { 517 nByte := len(main) 518 nByteDiv2 := nByte >> 1 519 for idx, invIdx := 0, nByte-1; idx != nByteDiv2; idx, invIdx = idx+1, invIdx-1 { 520 main[idx], main[invIdx] = main[invIdx], main[idx] 521 } 522 } 523 524 func TestReverse8(t *testing.T) { 525 maxSize := 500 526 nIter := 200 527 main1Arr := simd.MakeUnsafe(maxSize) 528 main2Arr := simd.MakeUnsafe(maxSize) 529 main3Arr := simd.MakeUnsafe(maxSize) 530 main4Arr := simd.MakeUnsafe(maxSize) 531 src2Arr := simd.MakeUnsafe(maxSize) 532 for iter := 0; iter < nIter; iter++ { 533 sliceStart := rand.Intn(maxSize) 534 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 535 main1Slice := main1Arr[sliceStart:sliceEnd] 536 main2Slice := main2Arr[sliceStart:sliceEnd] 537 main3Slice := main3Arr[sliceStart:sliceEnd] 538 main4Slice := main4Arr[sliceStart:sliceEnd] 539 src2Slice := src2Arr[sliceStart:sliceEnd] 540 for ii := range main1Slice { 541 main1Slice[ii] = byte(rand.Intn(256)) 542 } 543 copy(main2Slice, main1Slice) 544 copy(src2Slice, main1Slice) 545 sentinel := byte(rand.Intn(256)) 546 main2Arr[sliceEnd] = sentinel 547 main4Arr[sliceEnd] = sentinel 548 simd.Reverse8Unsafe(main3Slice, main1Slice) 549 simd.Reverse8(main4Slice, main1Slice) 550 reverse8Slow(main1Slice) 551 simd.Reverse8Inplace(main2Slice) 552 if !bytes.Equal(main1Slice, main2Slice) { 553 t.Fatal("Mismatched Reverse8Inplace result.") 554 } 555 if main2Arr[sliceEnd] != sentinel { 556 t.Fatal("Reverse8Inplace clobbered an extra byte.") 557 } 558 if !bytes.Equal(main1Slice, main3Slice) { 559 t.Fatal("Mismatched Reverse8Unsafe result.") 560 } 561 if !bytes.Equal(main1Slice, main4Slice) { 562 t.Fatal("Mismatched Reverse8 result.") 563 } 564 if main4Arr[sliceEnd] != sentinel { 565 t.Fatal("Reverse8 clobbered an extra byte.") 566 } 567 simd.Reverse8Inplace(main4Slice) 568 if !bytes.Equal(src2Slice, main4Slice) { 569 t.Fatal("Reverse8Inplace didn't invert itself.") 570 } 571 } 572 } 573 574 /* 575 Benchmark results: 576 MacBook Pro (15-inch, 2016) 577 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 578 579 Benchmark_Reverse8Inplace/SIMDShort1Cpu-8 20 67121510 ns/op 580 Benchmark_Reverse8Inplace/SIMDShortHalfCpu-8 100 18891965 ns/op 581 Benchmark_Reverse8Inplace/SIMDShortAllCpu-8 100 16177224 ns/op 582 Benchmark_Reverse8Inplace/SIMDLong1Cpu-8 1 1115497033 ns/op 583 Benchmark_Reverse8Inplace/SIMDLongHalfCpu-8 2 885764257 ns/op 584 Benchmark_Reverse8Inplace/SIMDLongAllCpu-8 2 941948715 ns/op 585 Benchmark_Reverse8Inplace/SlowShort1Cpu-8 3 398662666 ns/op 586 Benchmark_Reverse8Inplace/SlowShortHalfCpu-8 10 105618119 ns/op 587 Benchmark_Reverse8Inplace/SlowShortAllCpu-8 10 184808267 ns/op 588 Benchmark_Reverse8Inplace/SlowLong1Cpu-8 1 5665556658 ns/op 589 Benchmark_Reverse8Inplace/SlowLongHalfCpu-8 1 1597487158 ns/op 590 Benchmark_Reverse8Inplace/SlowLongAllCpu-8 1 1616963854 ns/op 591 */ 592 593 func reverse8InplaceSimdSubtask(dst, src []byte, nIter int) int { 594 for iter := 0; iter < nIter; iter++ { 595 simd.Reverse8Inplace(dst) 596 } 597 return int(dst[0]) 598 } 599 600 func reverse8InplaceSlowSubtask(dst, src []byte, nIter int) int { 601 for iter := 0; iter < nIter; iter++ { 602 reverse8Slow(dst) 603 } 604 return int(dst[0]) 605 } 606 607 func Benchmark_Reverse8Inplace(b *testing.B) { 608 funcs := []taggedMultiBenchFunc{ 609 { 610 f: reverse8InplaceSimdSubtask, 611 tag: "SIMD", 612 }, 613 { 614 f: reverse8InplaceSlowSubtask, 615 tag: "Slow", 616 }, 617 } 618 for _, f := range funcs { 619 multiBenchmark(f.f, f.tag+"Short", 75, 0, 9999999, b) 620 multiBenchmark(f.f, f.tag+"Long", 249250621, 0, 50, b) 621 } 622 } 623 624 func bitFromEveryByteSlow(dst, src []byte, bitIdx int) { 625 requiredDstLen := (len(src) + 7) >> 3 626 if (len(dst) < requiredDstLen) || (uint(bitIdx) > 7) { 627 panic("BitFromEveryByte requires len(dst) >= (len(src) + 7) / 8 and 0 <= bitIdx < 8.") 628 } 629 dst = dst[:requiredDstLen] 630 for i := range dst { 631 dst[i] = 0 632 } 633 for i, b := range src { 634 dst[i>>3] |= ((b >> uint32(bitIdx)) & 1) << uint32(i&7) 635 } 636 } 637 638 func bitFromEveryByteFancyNoasm(dst, src []byte, bitIdx int) { 639 requiredDstLen := (len(src) + 7) >> 3 640 if (len(dst) < requiredDstLen) || (uint(bitIdx) > 7) { 641 panic("BitFromEveryByte requires len(dst) >= (len(src) + 7) / 8 and 0 <= bitIdx < 8.") 642 } 643 nSrcFullWord := len(src) >> 3 644 for i := 0; i < nSrcFullWord; i++ { 645 // Tried using a unsafeBytesToWords function on src in place of 646 // binary.LittleEndian.Uint64, and it barely made any difference. 647 srcWord := binary.LittleEndian.Uint64(src[i*8:i*8+8]) >> uint32(bitIdx) 648 649 srcWord &= 0x101010101010101 650 651 // Before this operation, the bits of interest are at positions 0, 8, 16, 652 // 24, 32, 40, 48, and 56 in srcWord, and all other bits are guaranteed to 653 // be zero. 654 // 655 // Suppose the bit at position 16 is set, and no other bits are set. What 656 // does multiplication by the magic number 0x102040810204080 accomplish? 657 // Well, the magic number has bits set at positions 7, 14, 21, 28, 35, 42, 658 // 49, and 56. Multiplying by 2^16 is equivalent to left-shifting by 16, 659 // so the product has bits set at positions (7+16), (14+16), (21+16), 660 // (28+16), (35+16), (42+16), and the last two overflow off the top end. 661 // 662 // Now suppose the bits at position 0 and 16 are both set. The result is 663 // then the sum of (2^0) * <magic number> + (2^16) * <magic number>. The 664 // first term in this sum has bits set at positions 7, 14, ..., 56. 665 // Critically, *none of these bits overlap with the second term*, so there 666 // are no 'carries' when we add the two terms together. So the final 667 // product has bits set at positions 7, 14, 21, 23, 28, 30, 35, 37, 42, 44, 668 // 49, 51, 56, and 58. 669 // 670 // It turns out that none of the bits in any of the 8 terms of this product 671 // have overlapping positions. So the multiplication operation just makes 672 // a bunch of left-shifted copies of the original bits... and in 673 // particular, bits 56-63 of the product are: 674 // 56: original bit 0, left-shifted 56 675 // 57: original bit 8, left-shifted 49 676 // 58: original bit 16, left-shifted 42 677 // 59: original bit 24, left-shifted 35 678 // 60: original bit 32, left-shifted 28 679 // 61: original bit 40, left-shifted 21 680 // 62: original bit 48, left-shifted 14 681 // 63: original bit 56, left-shifted 7 682 // Thus, right-shifting the product by 56 gives us the byte we want. 683 // 684 // This is a very esoteric algorithm, and it doesn't have much direct 685 // application because all 64-bit x86 processors provide an assembly 686 // instruction which lets you do this >6 times as quickly. Occasionally 687 // the idea of using multiplication to create staggered left-shifted copies 688 // of bits does genuinely come in handy, though. 689 dst[i] = byte((srcWord * 0x102040810204080) >> 56) 690 } 691 if nSrcFullWord != requiredDstLen { 692 srcLast := src[nSrcFullWord*8:] 693 dstLast := dst[nSrcFullWord:requiredDstLen] 694 for i := range dstLast { 695 dstLast[i] = 0 696 } 697 for i, b := range srcLast { 698 dstLast[i>>3] |= ((b >> uint32(bitIdx)) & 1) << uint32(i&7) 699 } 700 } 701 } 702 703 func TestBitFromEveryByte(t *testing.T) { 704 maxSize := 500 705 nIter := 200 706 rand.Seed(1) 707 srcArr := make([]byte, maxSize) 708 dstArr1 := make([]byte, maxSize) 709 dstArr2 := make([]byte, maxSize) 710 dstArr3 := make([]byte, maxSize) 711 for iter := 0; iter < nIter; iter++ { 712 sliceStart := rand.Intn(maxSize) 713 srcSize := rand.Intn(maxSize - sliceStart) 714 srcSliceEnd := sliceStart + srcSize 715 srcSlice := srcArr[sliceStart:srcSliceEnd] 716 717 minDstSize := (srcSize + 7) >> 3 718 dstSliceEnd := sliceStart + minDstSize 719 dstSlice1 := dstArr1[sliceStart:dstSliceEnd] 720 dstSlice2 := dstArr2[sliceStart:dstSliceEnd] 721 dstSlice3 := dstArr3[sliceStart:dstSliceEnd] 722 723 for ii := range srcSlice { 724 srcSlice[ii] = byte(rand.Intn(256)) 725 } 726 sentinel := byte(rand.Intn(256)) 727 dstArr2[dstSliceEnd] = sentinel 728 729 bitIdx := rand.Intn(8) 730 bitFromEveryByteSlow(dstSlice1, srcSlice, bitIdx) 731 simd.BitFromEveryByte(dstSlice2, srcSlice, bitIdx) 732 assert.EQ(t, dstSlice1, dstSlice2) 733 assert.EQ(t, sentinel, dstArr2[dstSliceEnd]) 734 735 // Also validate the assembly-free multiplication-based algorithm. 736 sentinel = byte(rand.Intn(256)) 737 dstArr3[dstSliceEnd] = sentinel 738 bitFromEveryByteFancyNoasm(dstSlice3, srcSlice, bitIdx) 739 assert.EQ(t, dstSlice1, dstSlice3) 740 assert.EQ(t, sentinel, dstArr3[dstSliceEnd]) 741 } 742 } 743 744 /* 745 Benchmark results: 746 MacBook Pro (15-inch, 2016) 747 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 748 749 Benchmark_BitFromEveryByte/SIMDLong1Cpu-8 200 6861450 ns/op 750 Benchmark_BitFromEveryByte/SIMDLongHalfCpu-8 200 7360937 ns/op 751 Benchmark_BitFromEveryByte/SIMDLongAllCpu-8 200 8846261 ns/op 752 Benchmark_BitFromEveryByte/FancyNoasmLong1Cpu-8 20 58756902 ns/op 753 Benchmark_BitFromEveryByte/FancyNoasmLongHalfCpu-8 100 17244847 ns/op 754 Benchmark_BitFromEveryByte/FancyNoasmLongAllCpu-8 100 16624282 ns/op 755 Benchmark_BitFromEveryByte/SlowLong1Cpu-8 3 422073091 ns/op 756 Benchmark_BitFromEveryByte/SlowLongHalfCpu-8 10 117732813 ns/op 757 Benchmark_BitFromEveryByte/SlowLongAllCpu-8 10 114903556 ns/op 758 759 Notes: 1Cpu has higher throughput than HalfCpu/AllCpu on this test machine due 760 to L3 cache saturation: multiBenchmarkDstSrc makes each goroutine process its 761 own ~4 MB job, rather than splitting a single job into smaller pieces, and a 762 15-inch 2016 MacBook Pro has a 8 MB L3 cache. If you shrink the test size to 763 len(src)=400000, HalfCpu outperforms 1Cpu by the expected amount. 764 765 I'm leaving this unusual benchmark result here since (i) it corresponds to how 766 we actually need to use the function, and (ii) this phenomenon is definitely 767 worth knowing about. 768 */ 769 770 func bitFromEveryByteSimdSubtask(dst, src []byte, nIter int) int { 771 for iter := 0; iter < nIter; iter++ { 772 simd.BitFromEveryByte(dst, src, 0) 773 } 774 return int(dst[0]) 775 } 776 777 func bitFromEveryByteFancyNoasmSubtask(dst, src []byte, nIter int) int { 778 for iter := 0; iter < nIter; iter++ { 779 bitFromEveryByteFancyNoasm(dst, src, 0) 780 } 781 return int(dst[0]) 782 } 783 784 func bitFromEveryByteSlowSubtask(dst, src []byte, nIter int) int { 785 for iter := 0; iter < nIter; iter++ { 786 bitFromEveryByteSlow(dst, src, 0) 787 } 788 return int(dst[0]) 789 } 790 791 func Benchmark_BitFromEveryByte(b *testing.B) { 792 funcs := []taggedMultiBenchFunc{ 793 { 794 f: bitFromEveryByteSimdSubtask, 795 tag: "SIMD", 796 }, 797 { 798 f: bitFromEveryByteFancyNoasmSubtask, 799 tag: "FancyNoasm", 800 }, 801 { 802 f: bitFromEveryByteSlowSubtask, 803 tag: "Slow", 804 }, 805 } 806 for _, f := range funcs { 807 multiBenchmark(f.f, f.tag+"Long", 4091904/8, 4091904, 50, b) 808 } 809 }