github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/count_test.go

github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/count_test.go (about)

     1  // Copyright 2021 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package simd_test
     6  
     7  import (
     8  	"bytes"
     9  	"math/bits"
    10  	"math/rand"
    11  	"reflect"
    12  	"testing"
    13  	"unsafe"
    14  
    15  	"github.com/Schaudge/grailbase/simd"
    16  )
    17  
    18  func init() {
    19  	if unsafe.Sizeof(uintptr(0)) != 8 {
    20  		// popcnt_amd64.go shouldn't compile at all in this case, but just in
    21  		// case...
    22  		panic("8-byte words required.")
    23  	}
    24  }
    25  
    26  func popcntBytesNoasm(byteslice []byte) int {
    27  	bytesliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&byteslice))
    28  	ct := uintptr(len(byteslice))
    29  
    30  	bytearr := unsafe.Pointer(bytesliceHeader.Data)
    31  	endptr := unsafe.Add(bytearr, ct)
    32  	tot := 0
    33  	nLeadingByte := ct % 8
    34  	if nLeadingByte != 0 {
    35  		leadingWord := uint64(0)
    36  		if (nLeadingByte & 1) != 0 {
    37  			leadingWord = (uint64)(*(*byte)(bytearr))
    38  			bytearr = unsafe.Add(bytearr, 1)
    39  		}
    40  		if (nLeadingByte & 2) != 0 {
    41  			leadingWord <<= 16
    42  			leadingWord |= (uint64)(*(*uint16)(bytearr))
    43  			bytearr = unsafe.Add(bytearr, 2)
    44  		}
    45  		if (nLeadingByte & 4) != 0 {
    46  			leadingWord <<= 32
    47  			leadingWord |= (uint64)(*(*uint32)(bytearr))
    48  			bytearr = unsafe.Add(bytearr, 4)
    49  		}
    50  		tot = bits.OnesCount64(leadingWord)
    51  	}
    52  	// Strangely, performance of this loop seems to vary by ~20% on my Mac,
    53  	// depending on which of several equivalent ways I use to write it.
    54  	for bytearr != endptr {
    55  		tot += bits.OnesCount64((uint64)(*((*uint64)(bytearr))))
    56  		bytearr = unsafe.Add(bytearr, 8)
    57  	}
    58  	return tot
    59  }
    60  
    61  func popcntBytesSlow(bytes []byte) int {
    62  	// Slow (factor of 5-8x), but straightforward-to-verify implementation.
    63  	tot := 0
    64  	for _, b := range bytes {
    65  		tot += bits.OnesCount8(b)
    66  	}
    67  	return tot
    68  }
    69  
    70  func TestBytePopcnt(t *testing.T) {
    71  	// Generate a random string, then popcount 20000 random slices with lengths
    72  	// in [0, 5000).
    73  	maxSize := 5000
    74  	nIter := 20000
    75  	byteArr := make([]byte, 2*maxSize)
    76  	for i := range byteArr {
    77  		byteArr[i] = byte(rand.Intn(256))
    78  	}
    79  	for iter := 0; iter < nIter; iter++ {
    80  		sliceStart := rand.Intn(maxSize)
    81  		sliceEnd := sliceStart + rand.Intn(maxSize)
    82  		curSlice := byteArr[sliceStart:sliceEnd]
    83  		sum1 := simd.Popcnt(curSlice)
    84  		sum2 := popcntBytesNoasm(curSlice)
    85  		if sum1 != sum2 {
    86  			t.Fatal("Mismatched popcounts (noasm).")
    87  		}
    88  	}
    89  	nVerifyIter := 1000
    90  	for iter := 0; iter < nVerifyIter; iter++ {
    91  		sliceStart := rand.Intn(maxSize)
    92  		sliceEnd := sliceStart + rand.Intn(maxSize)
    93  		curSlice := byteArr[sliceStart:sliceEnd]
    94  		sum1 := simd.Popcnt(curSlice)
    95  		sum2 := popcntBytesSlow(curSlice)
    96  		if sum1 != sum2 {
    97  			t.Fatal("Mismatched popcounts (slow).")
    98  		}
    99  	}
   100  }
   101  
   102  /*
   103  Benchmark results:
   104    MacBook Pro (15-inch, 2016)
   105    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   106  
   107  Benchmark_Popcnt/SIMDShort1Cpu-8                      20          90993141 ns/op
   108  Benchmark_Popcnt/SIMDShortHalfCpu-8                   50          24639468 ns/op
   109  Benchmark_Popcnt/SIMDShortAllCpu-8                   100          23098747 ns/op
   110  Benchmark_Popcnt/SIMDLong1Cpu-8                        2         909927976 ns/op
   111  Benchmark_Popcnt/SIMDLongHalfCpu-8                     3         488961048 ns/op
   112  Benchmark_Popcnt/SIMDLongAllCpu-8                      3         466249901 ns/op
   113  Benchmark_Popcnt/NoasmShort1Cpu-8                     10         106873386 ns/op
   114  Benchmark_Popcnt/NoasmShortHalfCpu-8                  50          29290668 ns/op
   115  Benchmark_Popcnt/NoasmShortAllCpu-8                   50          29559455 ns/op
   116  Benchmark_Popcnt/NoasmLong1Cpu-8                       1        1217844097 ns/op
   117  Benchmark_Popcnt/NoasmLongHalfCpu-8                    2         507946501 ns/op
   118  Benchmark_Popcnt/NoasmLongAllCpu-8                     3         483458386 ns/op
   119  Benchmark_Popcnt/SlowShort1Cpu-8                       2         519449562 ns/op
   120  Benchmark_Popcnt/SlowShortHalfCpu-8                   10         139108095 ns/op
   121  Benchmark_Popcnt/SlowShortAllCpu-8                    10         143346876 ns/op
   122  Benchmark_Popcnt/SlowLong1Cpu-8                        1        7515831696 ns/op
   123  Benchmark_Popcnt/SlowLongHalfCpu-8                     1        2083880380 ns/op
   124  Benchmark_Popcnt/SlowLongAllCpu-8                      1        2064129411 ns/op
   125  
   126  Notes: The current SSE4.2 SIMD implementation just amounts to a 2x-unrolled
   127  OnesCount64 loop without flag-rechecking overhead; they're using the same
   128  underlying instruction.  AVX2/AVX-512 allow for faster bulk processing, though;
   129  see e.g. https://github.com/kimwalisch/libpopcnt .
   130  */
   131  
   132  func popcntSimdSubtask(dst, src []byte, nIter int) int {
   133  	sum := 0
   134  	for iter := 0; iter < nIter; iter++ {
   135  		sum += simd.Popcnt(src)
   136  	}
   137  	return sum
   138  }
   139  
   140  func popcntNoasmSubtask(dst, src []byte, nIter int) int {
   141  	sum := 0
   142  	for iter := 0; iter < nIter; iter++ {
   143  		sum += popcntBytesNoasm(src)
   144  	}
   145  	return sum
   146  }
   147  
   148  func popcntSlowSubtask(dst, src []byte, nIter int) int {
   149  	sum := 0
   150  	for iter := 0; iter < nIter; iter++ {
   151  		sum += popcntBytesSlow(src)
   152  	}
   153  	return sum
   154  }
   155  
   156  func Benchmark_Popcnt(b *testing.B) {
   157  	funcs := []taggedMultiBenchFunc{
   158  		{
   159  			f:   popcntSimdSubtask,
   160  			tag: "SIMD",
   161  		},
   162  		{
   163  			f:   popcntNoasmSubtask,
   164  			tag: "Noasm",
   165  		},
   166  		{
   167  			f:   popcntSlowSubtask,
   168  			tag: "Slow",
   169  		},
   170  	}
   171  	for _, f := range funcs {
   172  		multiBenchmark(f.f, f.tag+"Short", 0, 75, 9999999, b)
   173  		multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b)
   174  	}
   175  }
   176  
   177  var cgArr = [...]byte{'C', 'G'}
   178  
   179  func countCGStandard(src []byte) int {
   180  	return bytes.Count(src, cgArr[:1]) + bytes.Count(src, cgArr[1:2])
   181  }
   182  
   183  func countCGNaive(src []byte) int {
   184  	cnt := 0
   185  	for _, srcByte := range src {
   186  		// Note that (srcByte & 0xfb) == 'C' takes ~30% less time than this.
   187  		if srcByte == 'C' || srcByte == 'G' {
   188  			cnt++
   189  		}
   190  	}
   191  	return cnt
   192  }
   193  
   194  func TestCountCG(t *testing.T) {
   195  	maxSize := 10000
   196  	nIter := 200
   197  	srcArr := simd.MakeUnsafe(maxSize)
   198  	for iter := 0; iter < nIter; iter++ {
   199  		sliceStart := rand.Intn(maxSize)
   200  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   201  		srcSlice := srcArr[sliceStart:sliceEnd]
   202  		for ii := range srcSlice {
   203  			srcSlice[ii] = byte(rand.Intn(256))
   204  		}
   205  		result1 := countCGStandard(srcSlice)
   206  		result2 := simd.MaskThenCountByte(srcSlice, 0xfb, 'C')
   207  		if result1 != result2 {
   208  			t.Fatal("Mismatched MaskThenCountByte result.")
   209  		}
   210  		result2 = countCGNaive(srcSlice)
   211  		if result1 != result2 {
   212  			t.Fatal("Mismatched countCGStandard/countCGNaive results.")
   213  		}
   214  	}
   215  }
   216  
   217  /*
   218  Benchmark results:
   219    MacBook Pro (15-inch, 2016)
   220    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   221  
   222  Benchmark_CountCG/SIMDShort1Cpu-8                     10         119280079 ns/op
   223  Benchmark_CountCG/SIMDShortHalfCpu-8                  50          34743805 ns/op
   224  Benchmark_CountCG/SIMDShortAllCpu-8                   50          28507338 ns/op
   225  Benchmark_CountCG/SIMDLong1Cpu-8                       2         765099599 ns/op
   226  Benchmark_CountCG/SIMDLongHalfCpu-8                    3         491655239 ns/op
   227  Benchmark_CountCG/SIMDLongAllCpu-8                     3         452592924 ns/op
   228  Benchmark_CountCG/StandardShort1Cpu-8                  5         237081120 ns/op
   229  Benchmark_CountCG/StandardShortHalfCpu-8              20          64949969 ns/op
   230  Benchmark_CountCG/StandardShortAllCpu-8               20          59167932 ns/op
   231  Benchmark_CountCG/StandardLong1Cpu-8                   1        1496389230 ns/op
   232  Benchmark_CountCG/StandardLongHalfCpu-8                2         931898463 ns/op
   233  Benchmark_CountCG/StandardLongAllCpu-8                 2         980615182 ns/op
   234  */
   235  
   236  func countCGSimdSubtask(dst, src []byte, nIter int) int {
   237  	tot := 0
   238  	for iter := 0; iter < nIter; iter++ {
   239  		tot += simd.MaskThenCountByte(src, 0xfb, 'C')
   240  	}
   241  	return tot
   242  }
   243  
   244  func countCGStandardSubtask(dst, src []byte, nIter int) int {
   245  	tot := 0
   246  	for iter := 0; iter < nIter; iter++ {
   247  		tot += countCGStandard(src)
   248  	}
   249  	return tot
   250  }
   251  
   252  func Benchmark_CountCG(b *testing.B) {
   253  	funcs := []taggedMultiBenchFunc{
   254  		{
   255  			f:   countCGSimdSubtask,
   256  			tag: "SIMD",
   257  		},
   258  		{
   259  			f:   countCGStandardSubtask,
   260  			tag: "Standard",
   261  		},
   262  	}
   263  	for _, f := range funcs {
   264  		multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b)
   265  		multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b)
   266  	}
   267  }
   268  
   269  func count2BytesStandard(src, vals []byte) int {
   270  	// Not 'Slow' since bytes.Count is decently optimized for a single byte.
   271  	return bytes.Count(src, vals[:1]) + bytes.Count(src, vals[1:2])
   272  }
   273  
   274  func TestCount2Bytes(t *testing.T) {
   275  	maxSize := 10000
   276  	nIter := 200
   277  	srcArr := simd.MakeUnsafe(maxSize)
   278  	vals := make([]byte, 2)
   279  	for iter := 0; iter < nIter; iter++ {
   280  		sliceStart := rand.Intn(maxSize)
   281  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   282  		// sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)&^15
   283  		srcSlice := srcArr[sliceStart:sliceEnd]
   284  		for ii := range srcSlice {
   285  			srcSlice[ii] = byte(rand.Intn(256))
   286  		}
   287  		val1 := byte(rand.Intn(256))
   288  		val2 := val1 + 1
   289  		vals[0] = val1
   290  		vals[1] = val2
   291  		result1 := count2BytesStandard(srcSlice, vals)
   292  		result2 := simd.Count2Bytes(srcSlice, val1, val2)
   293  		if result1 != result2 {
   294  			t.Fatal("Mismatched Count2Bytes result.")
   295  		}
   296  	}
   297  }
   298  
   299  func count3BytesStandard(src, vals []byte) int {
   300  	return bytes.Count(src, vals[:1]) + bytes.Count(src, vals[1:2]) + bytes.Count(src, vals[2:3])
   301  }
   302  
   303  func TestCount3Bytes(t *testing.T) {
   304  	maxSize := 10000
   305  	nIter := 200
   306  	srcArr := simd.MakeUnsafe(maxSize)
   307  	vals := make([]byte, 3)
   308  	for iter := 0; iter < nIter; iter++ {
   309  		sliceStart := rand.Intn(maxSize)
   310  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   311  		srcSlice := srcArr[sliceStart:sliceEnd]
   312  		for ii := range srcSlice {
   313  			srcSlice[ii] = byte(rand.Intn(256))
   314  		}
   315  		val1 := byte(rand.Intn(256))
   316  		val2 := val1 + 1
   317  		val3 := val1 + 2
   318  		vals[0] = val1
   319  		vals[1] = val2
   320  		vals[2] = val3
   321  		result1 := count3BytesStandard(srcSlice, vals)
   322  		result2 := simd.Count3Bytes(srcSlice, val1, val2, val3)
   323  		if result1 != result2 {
   324  			t.Fatal("Mismatched Count3Bytes result.")
   325  		}
   326  	}
   327  }
   328  
   329  /*
   330  Benchmark results:
   331    MacBook Pro (15-inch, 2016)
   332    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   333  
   334  Benchmark_Count3Bytes/SIMDShort1Cpu-8                 10         141085860 ns/op
   335  Benchmark_Count3Bytes/SIMDShortHalfCpu-8              30          40371892 ns/op
   336  Benchmark_Count3Bytes/SIMDShortAllCpu-8               30          37769995 ns/op
   337  Benchmark_Count3Bytes/SIMDLong1Cpu-8                   2         945534510 ns/op
   338  Benchmark_Count3Bytes/SIMDLongHalfCpu-8                3         499146889 ns/op
   339  Benchmark_Count3Bytes/SIMDLongAllCpu-8                 3         475811932 ns/op
   340  Benchmark_Count3Bytes/StandardShort1Cpu-8              3         346637595 ns/op
   341  Benchmark_Count3Bytes/StandardShortHalfCpu-8          20          96524251 ns/op
   342  Benchmark_Count3Bytes/StandardShortAllCpu-8           20          87056185 ns/op
   343  Benchmark_Count3Bytes/StandardLong1Cpu-8               1        2260954596 ns/op
   344  Benchmark_Count3Bytes/StandardLongHalfCpu-8            1        1518757560 ns/op
   345  Benchmark_Count3Bytes/StandardLongAllCpu-8             1        1468352229 ns/op
   346  */
   347  
   348  func count3BytesSimdSubtask(dst, src []byte, nIter int) int {
   349  	tot := 0
   350  	for iter := 0; iter < nIter; iter++ {
   351  		tot += simd.Count3Bytes(src, 'A', 'T', 'N')
   352  	}
   353  	return tot
   354  }
   355  
   356  func count3BytesStandardSubtask(dst, src []byte, nIter int) int {
   357  	tot := 0
   358  	vals := []byte{'A', 'T', 'N'}
   359  	for iter := 0; iter < nIter; iter++ {
   360  		tot += count3BytesStandard(src, vals)
   361  	}
   362  	return tot
   363  }
   364  
   365  func Benchmark_Count3Bytes(b *testing.B) {
   366  	funcs := []taggedMultiBenchFunc{
   367  		{
   368  			f:   count3BytesSimdSubtask,
   369  			tag: "SIMD",
   370  		},
   371  		{
   372  			f:   count3BytesStandardSubtask,
   373  			tag: "Standard",
   374  		},
   375  	}
   376  	for _, f := range funcs {
   377  		multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b)
   378  		multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b)
   379  	}
   380  }
   381  
   382  func countNibblesInSetSlow(src []byte, tablePtr *simd.NibbleLookupTable) int {
   383  	cnt := 0
   384  	for _, srcByte := range src {
   385  		cnt += int(tablePtr.Get(srcByte&15) + tablePtr.Get(srcByte>>4))
   386  	}
   387  	return cnt
   388  }
   389  
   390  func TestCountNibblesInSet(t *testing.T) {
   391  	maxSize := 10000
   392  	nIter := 200
   393  	srcArr := simd.MakeUnsafe(maxSize)
   394  	var table [16]byte
   395  	for iter := 0; iter < nIter; iter++ {
   396  		sliceStart := rand.Intn(maxSize)
   397  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   398  		srcSlice := srcArr[sliceStart:sliceEnd]
   399  		for ii := range srcSlice {
   400  			srcSlice[ii] = byte(rand.Intn(256))
   401  		}
   402  		baseCode1 := byte(rand.Intn(15))
   403  		baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1)))
   404  		table[baseCode1] = 1
   405  		table[baseCode2] = 1
   406  		nlt := simd.MakeNibbleLookupTable(table)
   407  
   408  		result1 := countNibblesInSetSlow(srcSlice, &nlt)
   409  		result2 := simd.CountNibblesInSet(srcSlice, &nlt)
   410  		if result1 != result2 {
   411  			t.Fatal("Mismatched CountNibblesInSet result.")
   412  		}
   413  		table[baseCode1] = 0
   414  		table[baseCode2] = 0
   415  	}
   416  }
   417  
   418  func TestCountNibblesInTwoSets(t *testing.T) {
   419  	maxSize := 10000
   420  	nIter := 200
   421  	srcArr := simd.MakeUnsafe(maxSize)
   422  	var table1, table2 [16]byte
   423  	for iter := 0; iter < nIter; iter++ {
   424  		sliceStart := rand.Intn(maxSize)
   425  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   426  		srcSlice := srcArr[sliceStart:sliceEnd]
   427  		for ii := range srcSlice {
   428  			srcSlice[ii] = byte(rand.Intn(256))
   429  		}
   430  		baseCode1 := byte(rand.Intn(15))
   431  		baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1)))
   432  		table1[baseCode1] = 1
   433  		table1[baseCode2] = 1
   434  
   435  		for ii := 0; ii != 5; ii++ {
   436  			table2[rand.Intn(16)] = 1
   437  		}
   438  		nlt1 := simd.MakeNibbleLookupTable(table1)
   439  		nlt2 := simd.MakeNibbleLookupTable(table2)
   440  
   441  		result1a := countNibblesInSetSlow(srcSlice, &nlt1)
   442  		result1b := countNibblesInSetSlow(srcSlice, &nlt2)
   443  		result2a, result2b := simd.CountNibblesInTwoSets(srcSlice, &nlt1, &nlt2)
   444  		if (result1a != result2a) || (result1b != result2b) {
   445  			t.Fatal("Mismatched CountNibblesInTwoSets result.")
   446  		}
   447  		table1[baseCode1] = 0
   448  		table1[baseCode2] = 0
   449  		for pos := range table2 {
   450  			table2[pos] = 0
   451  		}
   452  	}
   453  }
   454  
   455  func countUnpackedNibblesInSetSlow(src []byte, tablePtr *simd.NibbleLookupTable) int {
   456  	cnt := 0
   457  	for _, srcByte := range src {
   458  		cnt += int(tablePtr.Get(srcByte))
   459  	}
   460  	return cnt
   461  }
   462  
   463  func TestCountUnpackedNibblesInSet(t *testing.T) {
   464  	maxSize := 10000
   465  	nIter := 200
   466  	srcArr := simd.MakeUnsafe(maxSize)
   467  	var table [16]byte
   468  	for iter := 0; iter < nIter; iter++ {
   469  		sliceStart := rand.Intn(maxSize)
   470  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   471  		srcSlice := srcArr[sliceStart:sliceEnd]
   472  		for ii := range srcSlice {
   473  			srcSlice[ii] = byte(rand.Intn(16))
   474  		}
   475  		baseCode1 := byte(rand.Intn(15))
   476  		baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1)))
   477  		table[baseCode1] = 1
   478  		table[baseCode2] = 1
   479  		nlt := simd.MakeNibbleLookupTable(table)
   480  
   481  		result1 := countUnpackedNibblesInSetSlow(srcSlice, &nlt)
   482  		result2 := simd.CountUnpackedNibblesInSet(srcSlice, &nlt)
   483  		if result1 != result2 {
   484  			t.Fatal("Mismatched CountUnpackedNibblesInSet result.")
   485  		}
   486  		table[baseCode1] = 0
   487  		table[baseCode2] = 0
   488  	}
   489  }
   490  
   491  func TestCountUnpackedNibblesInTwoSets(t *testing.T) {
   492  	maxSize := 10000
   493  	nIter := 200
   494  	srcArr := simd.MakeUnsafe(maxSize)
   495  	var table1, table2 [16]byte
   496  	for iter := 0; iter < nIter; iter++ {
   497  		sliceStart := rand.Intn(maxSize)
   498  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   499  		srcSlice := srcArr[sliceStart:sliceEnd]
   500  		for ii := range srcSlice {
   501  			srcSlice[ii] = byte(rand.Intn(16))
   502  		}
   503  		baseCode1 := byte(rand.Intn(15))
   504  		baseCode2 := baseCode1 + 1 + byte(rand.Intn(int(15-baseCode1)))
   505  		table1[baseCode1] = 1
   506  		table1[baseCode2] = 1
   507  
   508  		for ii := 0; ii != 5; ii++ {
   509  			table2[rand.Intn(16)] = 1
   510  		}
   511  		nlt1 := simd.MakeNibbleLookupTable(table1)
   512  		nlt2 := simd.MakeNibbleLookupTable(table2)
   513  
   514  		result1a := countUnpackedNibblesInSetSlow(srcSlice, &nlt1)
   515  		result1b := countUnpackedNibblesInSetSlow(srcSlice, &nlt2)
   516  		result2a, result2b := simd.CountUnpackedNibblesInTwoSets(srcSlice, &nlt1, &nlt2)
   517  		if (result1a != result2a) || (result1b != result2b) {
   518  			t.Fatal("Mismatched CountUnpackedNibblesInTwoSets result.")
   519  		}
   520  		table1[baseCode1] = 0
   521  		table1[baseCode2] = 0
   522  		for pos := range table2 {
   523  			table2[pos] = 0
   524  		}
   525  	}
   526  }
   527  
   528  func accumulate8Slow(src []byte) int {
   529  	cnt := 0
   530  	for _, srcByte := range src {
   531  		cnt += int(srcByte)
   532  	}
   533  	return cnt
   534  }
   535  
   536  func TestAccumulate8(t *testing.T) {
   537  	maxSize := 500
   538  	nIter := 200
   539  	srcArr := simd.MakeUnsafe(maxSize)
   540  	for iter := 0; iter < nIter; iter++ {
   541  		sliceStart := rand.Intn(maxSize)
   542  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   543  		srcSlice := srcArr[sliceStart:sliceEnd]
   544  		for ii := range srcSlice {
   545  			srcSlice[ii] = byte(rand.Intn(256))
   546  		}
   547  
   548  		result1 := accumulate8Slow(srcSlice)
   549  		result2 := simd.Accumulate8(srcSlice)
   550  		if result1 != result2 {
   551  			t.Fatal("Mismatched Accumulate8 result.")
   552  		}
   553  	}
   554  }
   555  
   556  /*
   557  Benchmark results:
   558    MacBook Pro (15-inch, 2016)
   559    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   560  
   561  Benchmark_Accumulate8/SIMDShort1Cpu-8                 20          92560842 ns/op
   562  Benchmark_Accumulate8/SIMDShortHalfCpu-8              50          24796260 ns/op
   563  Benchmark_Accumulate8/SIMDShortAllCpu-8              100          21541910 ns/op
   564  Benchmark_Accumulate8/SIMDLong1Cpu-8                   2         778781187 ns/op
   565  Benchmark_Accumulate8/SIMDLongHalfCpu-8                3         466101270 ns/op
   566  Benchmark_Accumulate8/SIMDLongAllCpu-8                 3         472125495 ns/op
   567  Benchmark_Accumulate8/SlowShort1Cpu-8                  2         725211331 ns/op
   568  Benchmark_Accumulate8/SlowShortHalfCpu-8              10         192303935 ns/op
   569  Benchmark_Accumulate8/SlowShortAllCpu-8               10         146159760 ns/op
   570  Benchmark_Accumulate8/SlowLong1Cpu-8                   1        5371110621 ns/op
   571  Benchmark_Accumulate8/SlowLongHalfCpu-8                1        1473946277 ns/op
   572  Benchmark_Accumulate8/SlowLongAllCpu-8                 1        1118962315 ns/op
   573  */
   574  
   575  func accumulate8SimdSubtask(dst, src []byte, nIter int) int {
   576  	tot := 0
   577  	for iter := 0; iter < nIter; iter++ {
   578  		tot += simd.Accumulate8(src)
   579  	}
   580  	return tot
   581  }
   582  
   583  func accumulate8SlowSubtask(dst, src []byte, nIter int) int {
   584  	tot := 0
   585  	for iter := 0; iter < nIter; iter++ {
   586  		tot += accumulate8Slow(src)
   587  	}
   588  	return tot
   589  }
   590  
   591  func Benchmark_Accumulate8(b *testing.B) {
   592  	funcs := []taggedMultiBenchFunc{
   593  		{
   594  			f:   accumulate8SimdSubtask,
   595  			tag: "SIMD",
   596  		},
   597  		{
   598  			f:   accumulate8SlowSubtask,
   599  			tag: "Slow",
   600  		},
   601  	}
   602  	for _, f := range funcs {
   603  		multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b)
   604  		multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b)
   605  	}
   606  }
   607  
   608  func accumulate8GreaterSlow(src []byte, val byte) int {
   609  	cnt := 0
   610  	for _, srcByte := range src {
   611  		if srcByte > val {
   612  			cnt += int(srcByte)
   613  		}
   614  	}
   615  	return cnt
   616  }
   617  
   618  func TestAccumulate8Greater(t *testing.T) {
   619  	maxSize := 500
   620  	nIter := 200
   621  	srcArr := simd.MakeUnsafe(maxSize)
   622  	for iter := 0; iter < nIter; iter++ {
   623  		sliceStart := rand.Intn(maxSize)
   624  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   625  		srcSlice := srcArr[sliceStart:sliceEnd]
   626  		for ii := range srcSlice {
   627  			srcSlice[ii] = byte(rand.Intn(256))
   628  		}
   629  
   630  		val := byte(rand.Intn(256))
   631  
   632  		result1 := accumulate8GreaterSlow(srcSlice, val)
   633  		result2 := simd.Accumulate8Greater(srcSlice, val)
   634  		if result1 != result2 {
   635  			t.Fatal("Mismatched Accumulate8Greater result.")
   636  		}
   637  	}
   638  }
   639  
   640  /*
   641  Benchmark results:
   642    MacBook Pro (15-inch, 2016)
   643    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   644  
   645  Benchmark_Accumulate8Greater/SIMDShort1Cpu-8                  10         137436870 ns/op
   646  Benchmark_Accumulate8Greater/SIMDShortHalfCpu-8               50          36257710 ns/op
   647  Benchmark_Accumulate8Greater/SIMDShortAllCpu-8                50          32131334 ns/op
   648  Benchmark_Accumulate8Greater/SIMDLong1Cpu-8                    2         895831574 ns/op
   649  Benchmark_Accumulate8Greater/SIMDLongHalfCpu-8                 2         501501504 ns/op
   650  Benchmark_Accumulate8Greater/SIMDLongAllCpu-8                  3         473122019 ns/op
   651  Benchmark_Accumulate8Greater/SlowShort1Cpu-8                   1        1026311714 ns/op
   652  Benchmark_Accumulate8Greater/SlowShortHalfCpu-8                5         270841153 ns/op
   653  Benchmark_Accumulate8Greater/SlowShortAllCpu-8                 5         254131935 ns/op
   654  Benchmark_Accumulate8Greater/SlowLong1Cpu-8                    1        7651910478 ns/op
   655  Benchmark_Accumulate8Greater/SlowLongHalfCpu-8                 1        2113221447 ns/op
   656  Benchmark_Accumulate8Greater/SlowLongAllCpu-8                  1        2047822921 ns/op
   657  */
   658  
   659  func accumulate8GreaterSimdSubtask(dst, src []byte, nIter int) int {
   660  	tot := 0
   661  	for iter := 0; iter < nIter; iter++ {
   662  		tot += simd.Accumulate8Greater(src, 14)
   663  	}
   664  	return tot
   665  }
   666  
   667  func accumulate8GreaterSlowSubtask(dst, src []byte, nIter int) int {
   668  	tot := 0
   669  	for iter := 0; iter < nIter; iter++ {
   670  		tot += accumulate8GreaterSlow(src, 14)
   671  	}
   672  	return tot
   673  }
   674  
   675  func Benchmark_Accumulate8Greater(b *testing.B) {
   676  	funcs := []taggedMultiBenchFunc{
   677  		{
   678  			f:   accumulate8GreaterSimdSubtask,
   679  			tag: "SIMD",
   680  		},
   681  		{
   682  			f:   accumulate8GreaterSlowSubtask,
   683  			tag: "Slow",
   684  		},
   685  	}
   686  	for _, f := range funcs {
   687  		multiBenchmark(f.f, f.tag+"Short", 0, 150, 9999999, b)
   688  		multiBenchmark(f.f, f.tag+"Long", 0, 249250621, 50, b)
   689  	}
   690  }