github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/count_test.go (about) 1 // Copyright 2021 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 package simd_test 6 7 import ( 8 "bytes" 9 "math/bits" 10 "math/rand" 11 "reflect" 12 "testing" 13 "unsafe" 14 15 "github.com/Schaudge/grailbase/simd" 16 ) 17 18 func init() { 19 if unsafe.Sizeof(uintptr(0)) != 8 { 20 // popcnt_amd64.go shouldn't compile at all in this case, but just in 21 // case... 22 panic("8-byte words required.") 23 } 24 } 25 26 func popcntBytesNoasm(byteslice []byte) int { 27 bytesliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&byteslice)) 28 ct := uintptr(len(byteslice)) 29 30 bytearr := unsafe.Pointer(bytesliceHeader.Data) 31 endptr := unsafe.Add(bytearr, ct) 32 tot := 0 33 nLeadingByte := ct % 8 34 if nLeadingByte != 0 { 35 leadingWord := uint64(0) 36 if (nLeadingByte & 1) != 0 { 37 leadingWord = (uint64)(*(*byte)(bytearr)) 38 bytearr = unsafe.Add(bytearr, 1) 39 } 40 if (nLeadingByte & 2) != 0 { 41 leadingWord <<= 16 42 leadingWord |= (uint64)(*(*uint16)(bytearr)) 43 bytearr = unsafe.Add(bytearr, 2) 44 } 45 if (nLeadingByte & 4) != 0 { 46 leadingWord <<= 32 47 leadingWord |= (uint64)(*(*uint32)(bytearr)) 48 bytearr = unsafe.Add(bytearr, 4) 49 } 50 tot = bits.OnesCount64(leadingWord) 51 } 52 // Strangely, performance of this loop seems to vary by ~20% on my Mac, 53 // depending on which of several equivalent ways I use to write it. 54 for bytearr != endptr { 55 tot += bits.OnesCount64((uint64)(*((*uint64)(bytearr)))) 56 bytearr = unsafe.Add(bytearr, 8) 57 } 58 return tot 59 } 60 61 func popcntBytesSlow(bytes []byte) int { 62 // Slow (factor of 5-8x), but straightforward-to-verify implementation. 63 tot := 0 64 for _, b := range bytes { 65 tot += bits.OnesCount8(b) 66 } 67 return tot 68 } 69 70 func TestBytePopcnt(t *testing.T) { 71 // Generate a random string, then popcount 20000 random slices with lengths 72 // in [0, 5000). 73 maxSize := 5000 74 nIter := 20000 75 byteArr := make([]byte, 2*maxSize) 76 for i := range byteArr { 77 byteArr[i] = byte(rand.Intn(256)) 78 } 79 for iter := 0; iter < nIter; iter++ { 80 sliceStart := rand.Intn(maxSize) 81 sliceEnd := sliceStart + rand.Intn(maxSize) 82 curSlice := byteArr[sliceStart:sliceEnd] 83 sum1 := simd.Popcnt(curSlice) 84 sum2 := popcntBytesNoasm(curSlice) 85 if sum1 != sum2 { 86 t.Fatal("Mismatched popcounts (noasm).") 87 } 88 } 89 nVerifyIter := 1000 90 for iter := 0; iter < nVerifyIter; iter++ { 91 sliceStart := rand.Intn(maxSize) 92 sliceEnd := sliceStart + rand.Intn(maxSize) 93 curSlice := byteArr[sliceStart:sliceEnd] 94 sum1 := simd.Popcnt(curSlice) 95 sum2 := popcntBytesSlow(curSlice) 96 if sum1 != sum2 { 97 t.Fatal("Mismatched popcounts (slow).") 98 } 99 } 100 } 101 102 /* 103 Benchmark results: 104 MacBook Pro (15-inch, 2016) 105 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 106 107 Benchmark_Popcnt/SIMDShort1Cpu-8 20 90993141 ns/op 108 Benchmark_Popcnt/SIMDShortHalfCpu-8 50 24639468 ns/op 109 Benchmark_Popcnt/SIMDShortAllCpu-8 100 23098747 ns/op 110 Benchmark_Popcnt/SIMDLong1Cpu-8 2 909927976 ns/op 111 Benchmark_Popcnt/SIMDLongHalfCpu-8 3 488961048 ns/op 112 Benchmark_Popcnt/SIMDLongAllCpu-8 3 466249901 ns/op 113 Benchmark_Popcnt/NoasmShort1Cpu-8 10 106873386 ns/op 114 Benchmark_Popcnt/NoasmShortHalfCpu-8 50 29290668 ns/op 115 Benchmark_Popcnt/NoasmShortAllCpu-8 50 29559455 ns/op 116 Benchmark_Popcnt/NoasmLong1Cpu-8 1 1217844097 ns/op 117 Benchmark_Popcnt/NoasmLongHalfCpu-8 2 507946501 ns/op 118 Benchmark_Popcnt/NoasmLongAllCpu-8 3 483458386 ns/op 119 Benchmark_Popcnt/SlowShort1Cpu-8 2 519449562 ns/op 120 Benchmark_Popcnt/SlowShortHalfCpu-8 10 139108095 ns/op 121 Benchmark_Popcnt/SlowShortAllCpu-8 10 143346876 ns/op 122 Benchmark_Popcnt/SlowLong1Cpu-8 1 7515831696 ns/op 123 Benchmark_Popcnt/SlowLongHalfCpu-8 1 2083880380 ns/op 124 Benchmark_Popcnt/SlowLongAllCpu-8 1 2064129411 ns/op 125 126 Notes: The current SSE4.2 SIMD implementation just amounts to a 2x-unrolled 127 OnesCount64 loop without flag-rechecking overhead; they're using the same 128 underlying instruction. AVX2/AVX-512 allow for faster bulk processing, though; 129 see e.g. https://github.com/kimwalisch/libpopcnt . 130 */ 131 132 func popcntSimdSubtask(dst, src []byte, nIter int) int { 133 sum := 0 134 for iter := 0; iter < nIter; iter++ { 135 sum += simd.Popcnt(src) 136 } 137 return sum 138 } 139 140 func popcntNoasmSubtask(dst, src []byte, nIter int) int { 141 sum := 0 142 for iter := 0; iter < nIter; iter++ { 143 sum += popcntBytesNoasm(src) 144 } 145 return sum 146 } 147 148 func popcntSlowSubtask(dst, src []byte, nIter int) int { 149 sum := 0 150 for iter := 0; iter < nIter; iter++ { 151 sum += popcntBytesSlow(src) 152 } 153 return sum 154 } 155 156 func Benchmark_Popcnt(b *testing.B) { 157 funcs := []taggedMultiBenchFunc{ 158 { 159 f: popcntSimdSubtask, 160 tag: "SIMD", 161 }, 162 { 163 f: popcntNoasmSubtask, 164 tag: "Noasm", 165 }, 166 { 167 f: popcntSlowSubtask, 168 tag: "Slow", 169 }, 170 } 171 for _, f := range funcs { 172 multiBenchmark(f.f, f.tag+"Short", 0, 75, 9999999, b) 173 multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b) 174 } 175 } 176 177 var cgArr = [...]byte{'C', 'G'} 178 179 func countCGStandard(src []byte) int { 180 return bytes.Count(src, cgArr[:1]) + bytes.Count(src, cgArr[1:2]) 181 } 182 183 func countCGNaive(src []byte) int { 184 cnt := 0 185 for _, srcByte := range src { 186 // Note that (srcByte & 0xfb) == 'C' takes ~30% less time than this. 187 if srcByte == 'C' || srcByte == 'G' { 188 cnt++ 189 } 190 } 191 return cnt 192 } 193 194 func TestCountCG(t *testing.T) { 195 maxSize := 10000 196 nIter := 200 197 srcArr := simd.MakeUnsafe(maxSize) 198 for iter := 0; iter < nIter; iter++ { 199 sliceStart := rand.Intn(maxSize) 200 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 201 srcSlice := srcArr[sliceStart:sliceEnd] 202 for ii := range srcSlice { 203 srcSlice[ii] = byte(rand.Intn(256)) 204 } 205 result1 := countCGStandard(srcSlice) 206 result2 := simd.MaskThenCountByte(srcSlice, 0xfb, 'C') 207 if result1 != result2 { 208 t.Fatal("Mismatched MaskThenCountByte result.") 209 } 210 result2 = countCGNaive(srcSlice) 211 if result1 != result2 { 212 t.Fatal("Mismatched countCGStandard/countCGNaive results.") 213 } 214 } 215 } 216 217 /* 218 Benchmark results: 219 MacBook Pro (15-inch, 2016) 220 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 221 222 Benchmark_CountCG/SIMDShort1Cpu-8 10 119280079 ns/op 223 Benchmark_CountCG/SIMDShortHalfCpu-8 50 34743805 ns/op 224 Benchmark_CountCG/SIMDShortAllCpu-8 50 28507338 ns/op 225 Benchmark_CountCG/SIMDLong1Cpu-8 2 765099599 ns/op 226 Benchmark_CountCG/SIMDLongHalfCpu-8 3 491655239 ns/op 227 Benchmark_CountCG/SIMDLongAllCpu-8 3 452592924 ns/op 228 Benchmark_CountCG/StandardShort1Cpu-8 5 237081120 ns/op 229 Benchmark_CountCG/StandardShortHalfCpu-8 20 64949969 ns/op 230 Benchmark_CountCG/StandardShortAllCpu-8 20 59167932 ns/op 231 Benchmark_CountCG/StandardLong1Cpu-8 1 1496389230 ns/op 232 Benchmark_CountCG/StandardLongHalfCpu-8 2 931898463 ns/op 233 Benchmark_CountCG/StandardLongAllCpu-8 2 980615182 ns/op 234 */ 235 236 func countCGSimdSubtask(dst, src []byte, nIter int) int { 237 tot := 0 238 for iter := 0; iter < nIter; iter++ { 239 tot += simd.MaskThenCountByte(src, 0xfb, 'C') 240 } 241 return tot 242 } 243 244 func countCGStandardSubtask(dst, src []byte, nIter int) int { 245 tot := 0 246 for iter := 0; iter < nIter; iter++ { 247 tot += countCGStandard(src) 248 } 249 return tot 250 } 251 252 func Benchmark_CountCG(b *testing.B) { 253 funcs := []taggedMultiBenchFunc{ 254 { 255 f: countCGSimdSubtask, 256 tag: "SIMD", 257 }, 258 { 259 f: countCGStandardSubtask, 260 tag: "Standard", 261 }, 262 } 263 for _, f := range funcs { 264 multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b) 265 multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b) 266 } 267 } 268 269 func count2BytesStandard(src, vals []byte) int { 270 // Not 'Slow' since bytes.Count is decently optimized for a single byte. 271 return bytes.Count(src, vals[:1]) + bytes.Count(src, vals[1:2]) 272 } 273 274 func TestCount2Bytes(t *testing.T) { 275 maxSize := 10000 276 nIter := 200 277 srcArr := simd.MakeUnsafe(maxSize) 278 vals := make([]byte, 2) 279 for iter := 0; iter < nIter; iter++ { 280 sliceStart := rand.Intn(maxSize) 281 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 282 // sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)&^15 283 srcSlice := srcArr[sliceStart:sliceEnd] 284 for ii := range srcSlice { 285 srcSlice[ii] = byte(rand.Intn(256)) 286 } 287 val1 := byte(rand.Intn(256)) 288 val2 := val1 + 1 289 vals[0] = val1 290 vals[1] = val2 291 result1 := count2BytesStandard(srcSlice, vals) 292 result2 := simd.Count2Bytes(srcSlice, val1, val2) 293 if result1 != result2 { 294 t.Fatal("Mismatched Count2Bytes result.") 295 } 296 } 297 } 298 299 func count3BytesStandard(src, vals []byte) int { 300 return bytes.Count(src, vals[:1]) + bytes.Count(src, vals[1:2]) + bytes.Count(src, vals[2:3]) 301 } 302 303 func TestCount3Bytes(t *testing.T) { 304 maxSize := 10000 305 nIter := 200 306 srcArr := simd.MakeUnsafe(maxSize) 307 vals := make([]byte, 3) 308 for iter := 0; iter < nIter; iter++ { 309 sliceStart := rand.Intn(maxSize) 310 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 311 srcSlice := srcArr[sliceStart:sliceEnd] 312 for ii := range srcSlice { 313 srcSlice[ii] = byte(rand.Intn(256)) 314 } 315 val1 := byte(rand.Intn(256)) 316 val2 := val1 + 1 317 val3 := val1 + 2 318 vals[0] = val1 319 vals[1] = val2 320 vals[2] = val3 321 result1 := count3BytesStandard(srcSlice, vals) 322 result2 := simd.Count3Bytes(srcSlice, val1, val2, val3) 323 if result1 != result2 { 324 t.Fatal("Mismatched Count3Bytes result.") 325 } 326 } 327 } 328 329 /* 330 Benchmark results: 331 MacBook Pro (15-inch, 2016) 332 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 333 334 Benchmark_Count3Bytes/SIMDShort1Cpu-8 10 141085860 ns/op 335 Benchmark_Count3Bytes/SIMDShortHalfCpu-8 30 40371892 ns/op 336 Benchmark_Count3Bytes/SIMDShortAllCpu-8 30 37769995 ns/op 337 Benchmark_Count3Bytes/SIMDLong1Cpu-8 2 945534510 ns/op 338 Benchmark_Count3Bytes/SIMDLongHalfCpu-8 3 499146889 ns/op 339 Benchmark_Count3Bytes/SIMDLongAllCpu-8 3 475811932 ns/op 340 Benchmark_Count3Bytes/StandardShort1Cpu-8 3 346637595 ns/op 341 Benchmark_Count3Bytes/StandardShortHalfCpu-8 20 96524251 ns/op 342 Benchmark_Count3Bytes/StandardShortAllCpu-8 20 87056185 ns/op 343 Benchmark_Count3Bytes/StandardLong1Cpu-8 1 2260954596 ns/op 344 Benchmark_Count3Bytes/StandardLongHalfCpu-8 1 1518757560 ns/op 345 Benchmark_Count3Bytes/StandardLongAllCpu-8 1 1468352229 ns/op 346 */ 347 348 func count3BytesSimdSubtask(dst, src []byte, nIter int) int { 349 tot := 0 350 for iter := 0; iter < nIter; iter++ { 351 tot += simd.Count3Bytes(src, 'A', 'T', 'N') 352 } 353 return tot 354 } 355 356 func count3BytesStandardSubtask(dst, src []byte, nIter int) int { 357 tot := 0 358 vals := []byte{'A', 'T', 'N'} 359 for iter := 0; iter < nIter; iter++ { 360 tot += count3BytesStandard(src, vals) 361 } 362 return tot 363 } 364 365 func Benchmark_Count3Bytes(b *testing.B) { 366 funcs := []taggedMultiBenchFunc{ 367 { 368 f: count3BytesSimdSubtask, 369 tag: "SIMD", 370 }, 371 { 372 f: count3BytesStandardSubtask, 373 tag: "Standard", 374 }, 375 } 376 for _, f := range funcs { 377 multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b) 378 multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b) 379 } 380 } 381 382 func countNibblesInSetSlow(src []byte, tablePtr *simd.NibbleLookupTable) int { 383 cnt := 0 384 for _, srcByte := range src { 385 cnt += int(tablePtr.Get(srcByte&15) + tablePtr.Get(srcByte>>4)) 386 } 387 return cnt 388 } 389 390 func TestCountNibblesInSet(t *testing.T) { 391 maxSize := 10000 392 nIter := 200 393 srcArr := simd.MakeUnsafe(maxSize) 394 var table [16]byte 395 for iter := 0; iter < nIter; iter++ { 396 sliceStart := rand.Intn(maxSize) 397 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 398 srcSlice := srcArr[sliceStart:sliceEnd] 399 for ii := range srcSlice { 400 srcSlice[ii] = byte(rand.Intn(256)) 401 } 402 baseCode1 := byte(rand.Intn(15)) 403 baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1))) 404 table[baseCode1] = 1 405 table[baseCode2] = 1 406 nlt := simd.MakeNibbleLookupTable(table) 407 408 result1 := countNibblesInSetSlow(srcSlice, &nlt) 409 result2 := simd.CountNibblesInSet(srcSlice, &nlt) 410 if result1 != result2 { 411 t.Fatal("Mismatched CountNibblesInSet result.") 412 } 413 table[baseCode1] = 0 414 table[baseCode2] = 0 415 } 416 } 417 418 func TestCountNibblesInTwoSets(t *testing.T) { 419 maxSize := 10000 420 nIter := 200 421 srcArr := simd.MakeUnsafe(maxSize) 422 var table1, table2 [16]byte 423 for iter := 0; iter < nIter; iter++ { 424 sliceStart := rand.Intn(maxSize) 425 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 426 srcSlice := srcArr[sliceStart:sliceEnd] 427 for ii := range srcSlice { 428 srcSlice[ii] = byte(rand.Intn(256)) 429 } 430 baseCode1 := byte(rand.Intn(15)) 431 baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1))) 432 table1[baseCode1] = 1 433 table1[baseCode2] = 1 434 435 for ii := 0; ii != 5; ii++ { 436 table2[rand.Intn(16)] = 1 437 } 438 nlt1 := simd.MakeNibbleLookupTable(table1) 439 nlt2 := simd.MakeNibbleLookupTable(table2) 440 441 result1a := countNibblesInSetSlow(srcSlice, &nlt1) 442 result1b := countNibblesInSetSlow(srcSlice, &nlt2) 443 result2a, result2b := simd.CountNibblesInTwoSets(srcSlice, &nlt1, &nlt2) 444 if (result1a != result2a) || (result1b != result2b) { 445 t.Fatal("Mismatched CountNibblesInTwoSets result.") 446 } 447 table1[baseCode1] = 0 448 table1[baseCode2] = 0 449 for pos := range table2 { 450 table2[pos] = 0 451 } 452 } 453 } 454 455 func countUnpackedNibblesInSetSlow(src []byte, tablePtr *simd.NibbleLookupTable) int { 456 cnt := 0 457 for _, srcByte := range src { 458 cnt += int(tablePtr.Get(srcByte)) 459 } 460 return cnt 461 } 462 463 func TestCountUnpackedNibblesInSet(t *testing.T) { 464 maxSize := 10000 465 nIter := 200 466 srcArr := simd.MakeUnsafe(maxSize) 467 var table [16]byte 468 for iter := 0; iter < nIter; iter++ { 469 sliceStart := rand.Intn(maxSize) 470 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 471 srcSlice := srcArr[sliceStart:sliceEnd] 472 for ii := range srcSlice { 473 srcSlice[ii] = byte(rand.Intn(16)) 474 } 475 baseCode1 := byte(rand.Intn(15)) 476 baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1))) 477 table[baseCode1] = 1 478 table[baseCode2] = 1 479 nlt := simd.MakeNibbleLookupTable(table) 480 481 result1 := countUnpackedNibblesInSetSlow(srcSlice, &nlt) 482 result2 := simd.CountUnpackedNibblesInSet(srcSlice, &nlt) 483 if result1 != result2 { 484 t.Fatal("Mismatched CountUnpackedNibblesInSet result.") 485 } 486 table[baseCode1] = 0 487 table[baseCode2] = 0 488 } 489 } 490 491 func TestCountUnpackedNibblesInTwoSets(t *testing.T) { 492 maxSize := 10000 493 nIter := 200 494 srcArr := simd.MakeUnsafe(maxSize) 495 var table1, table2 [16]byte 496 for iter := 0; iter < nIter; iter++ { 497 sliceStart := rand.Intn(maxSize) 498 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 499 srcSlice := srcArr[sliceStart:sliceEnd] 500 for ii := range srcSlice { 501 srcSlice[ii] = byte(rand.Intn(16)) 502 } 503 baseCode1 := byte(rand.Intn(15)) 504 baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1))) 505 table1[baseCode1] = 1 506 table1[baseCode2] = 1 507 508 for ii := 0; ii != 5; ii++ { 509 table2[rand.Intn(16)] = 1 510 } 511 nlt1 := simd.MakeNibbleLookupTable(table1) 512 nlt2 := simd.MakeNibbleLookupTable(table2) 513 514 result1a := countUnpackedNibblesInSetSlow(srcSlice, &nlt1) 515 result1b := countUnpackedNibblesInSetSlow(srcSlice, &nlt2) 516 result2a, result2b := simd.CountUnpackedNibblesInTwoSets(srcSlice, &nlt1, &nlt2) 517 if (result1a != result2a) || (result1b != result2b) { 518 t.Fatal("Mismatched CountUnpackedNibblesInTwoSets result.") 519 } 520 table1[baseCode1] = 0 521 table1[baseCode2] = 0 522 for pos := range table2 { 523 table2[pos] = 0 524 } 525 } 526 } 527 528 func accumulate8Slow(src []byte) int { 529 cnt := 0 530 for _, srcByte := range src { 531 cnt += int(srcByte) 532 } 533 return cnt 534 } 535 536 func TestAccumulate8(t *testing.T) { 537 maxSize := 500 538 nIter := 200 539 srcArr := simd.MakeUnsafe(maxSize) 540 for iter := 0; iter < nIter; iter++ { 541 sliceStart := rand.Intn(maxSize) 542 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 543 srcSlice := srcArr[sliceStart:sliceEnd] 544 for ii := range srcSlice { 545 srcSlice[ii] = byte(rand.Intn(256)) 546 } 547 548 result1 := accumulate8Slow(srcSlice) 549 result2 := simd.Accumulate8(srcSlice) 550 if result1 != result2 { 551 t.Fatal("Mismatched Accumulate8 result.") 552 } 553 } 554 } 555 556 /* 557 Benchmark results: 558 MacBook Pro (15-inch, 2016) 559 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 560 561 Benchmark_Accumulate8/SIMDShort1Cpu-8 20 92560842 ns/op 562 Benchmark_Accumulate8/SIMDShortHalfCpu-8 50 24796260 ns/op 563 Benchmark_Accumulate8/SIMDShortAllCpu-8 100 21541910 ns/op 564 Benchmark_Accumulate8/SIMDLong1Cpu-8 2 778781187 ns/op 565 Benchmark_Accumulate8/SIMDLongHalfCpu-8 3 466101270 ns/op 566 Benchmark_Accumulate8/SIMDLongAllCpu-8 3 472125495 ns/op 567 Benchmark_Accumulate8/SlowShort1Cpu-8 2 725211331 ns/op 568 Benchmark_Accumulate8/SlowShortHalfCpu-8 10 192303935 ns/op 569 Benchmark_Accumulate8/SlowShortAllCpu-8 10 146159760 ns/op 570 Benchmark_Accumulate8/SlowLong1Cpu-8 1 5371110621 ns/op 571 Benchmark_Accumulate8/SlowLongHalfCpu-8 1 1473946277 ns/op 572 Benchmark_Accumulate8/SlowLongAllCpu-8 1 1118962315 ns/op 573 */ 574 575 func accumulate8SimdSubtask(dst, src []byte, nIter int) int { 576 tot := 0 577 for iter := 0; iter < nIter; iter++ { 578 tot += simd.Accumulate8(src) 579 } 580 return tot 581 } 582 583 func accumulate8SlowSubtask(dst, src []byte, nIter int) int { 584 tot := 0 585 for iter := 0; iter < nIter; iter++ { 586 tot += accumulate8Slow(src) 587 } 588 return tot 589 } 590 591 func Benchmark_Accumulate8(b *testing.B) { 592 funcs := []taggedMultiBenchFunc{ 593 { 594 f: accumulate8SimdSubtask, 595 tag: "SIMD", 596 }, 597 { 598 f: accumulate8SlowSubtask, 599 tag: "Slow", 600 }, 601 } 602 for _, f := range funcs { 603 multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b) 604 multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b) 605 } 606 } 607 608 func accumulate8GreaterSlow(src []byte, val byte) int { 609 cnt := 0 610 for _, srcByte := range src { 611 if srcByte > val { 612 cnt += int(srcByte) 613 } 614 } 615 return cnt 616 } 617 618 func TestAccumulate8Greater(t *testing.T) { 619 maxSize := 500 620 nIter := 200 621 srcArr := simd.MakeUnsafe(maxSize) 622 for iter := 0; iter < nIter; iter++ { 623 sliceStart := rand.Intn(maxSize) 624 sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart) 625 srcSlice := srcArr[sliceStart:sliceEnd] 626 for ii := range srcSlice { 627 srcSlice[ii] = byte(rand.Intn(256)) 628 } 629 630 val := byte(rand.Intn(256)) 631 632 result1 := accumulate8GreaterSlow(srcSlice, val) 633 result2 := simd.Accumulate8Greater(srcSlice, val) 634 if result1 != result2 { 635 t.Fatal("Mismatched Accumulate8Greater result.") 636 } 637 } 638 } 639 640 /* 641 Benchmark results: 642 MacBook Pro (15-inch, 2016) 643 2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3 644 645 Benchmark_Accumulate8Greater/SIMDShort1Cpu-8 10 137436870 ns/op 646 Benchmark_Accumulate8Greater/SIMDShortHalfCpu-8 50 36257710 ns/op 647 Benchmark_Accumulate8Greater/SIMDShortAllCpu-8 50 32131334 ns/op 648 Benchmark_Accumulate8Greater/SIMDLong1Cpu-8 2 895831574 ns/op 649 Benchmark_Accumulate8Greater/SIMDLongHalfCpu-8 2 501501504 ns/op 650 Benchmark_Accumulate8Greater/SIMDLongAllCpu-8 3 473122019 ns/op 651 Benchmark_Accumulate8Greater/SlowShort1Cpu-8 1 1026311714 ns/op 652 Benchmark_Accumulate8Greater/SlowShortHalfCpu-8 5 270841153 ns/op 653 Benchmark_Accumulate8Greater/SlowShortAllCpu-8 5 254131935 ns/op 654 Benchmark_Accumulate8Greater/SlowLong1Cpu-8 1 7651910478 ns/op 655 Benchmark_Accumulate8Greater/SlowLongHalfCpu-8 1 2113221447 ns/op 656 Benchmark_Accumulate8Greater/SlowLongAllCpu-8 1 2047822921 ns/op 657 */ 658 659 func accumulate8GreaterSimdSubtask(dst, src []byte, nIter int) int { 660 tot := 0 661 for iter := 0; iter < nIter; iter++ { 662 tot += simd.Accumulate8Greater(src, 14) 663 } 664 return tot 665 } 666 667 func accumulate8GreaterSlowSubtask(dst, src []byte, nIter int) int { 668 tot := 0 669 for iter := 0; iter < nIter; iter++ { 670 tot += accumulate8GreaterSlow(src, 14) 671 } 672 return tot 673 } 674 675 func Benchmark_Accumulate8Greater(b *testing.B) { 676 funcs := []taggedMultiBenchFunc{ 677 { 678 f: accumulate8GreaterSimdSubtask, 679 tag: "SIMD", 680 }, 681 { 682 f: accumulate8GreaterSlowSubtask, 683 tag: "Slow", 684 }, 685 } 686 for _, f := range funcs { 687 multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b) 688 multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b) 689 } 690 }