github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/simd_test.go

github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/simd_test.go (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package simd_test
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  	"math/rand"
    11  	"testing"
    12  
    13  	"github.com/Schaudge/grailbase/simd"
    14  	"github.com/grailbio/testutil/assert"
    15  )
    16  
    17  // This is the most-frequently-recommended implementation.  It's decent, so the
    18  // suffix is 'Standard' instead of 'Slow'.
    19  func memset8Standard(dst []byte, val byte) {
    20  	dstLen := len(dst)
    21  	if dstLen != 0 {
    22  		dst[0] = val
    23  		for i := 1; i < dstLen; {
    24  			i += copy(dst[i:], dst[:i])
    25  		}
    26  	}
    27  }
    28  
    29  func TestMemset8(t *testing.T) {
    30  	maxSize := 500
    31  	nIter := 200
    32  	main1Arr := simd.MakeUnsafe(maxSize)
    33  	main2Arr := simd.MakeUnsafe(maxSize)
    34  	main3Arr := simd.MakeUnsafe(maxSize)
    35  	for iter := 0; iter < nIter; iter++ {
    36  		sliceStart := rand.Intn(maxSize)
    37  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
    38  		main1Slice := main1Arr[sliceStart:sliceEnd]
    39  		main2Slice := main2Arr[sliceStart:sliceEnd]
    40  		main3Slice := main3Arr[sliceStart:sliceEnd]
    41  		byteVal := byte(rand.Intn(256))
    42  		memset8Standard(main1Slice, byteVal)
    43  		simd.Memset8Unsafe(main2Slice, byteVal)
    44  		if !bytes.Equal(main1Slice, main2Slice) {
    45  			t.Fatal("Mismatched Memset8Unsafe result.")
    46  		}
    47  		sentinel := byte(rand.Intn(256))
    48  		if len(main3Slice) > 0 {
    49  			main3Slice[0] = 0
    50  		}
    51  		main3Arr[sliceEnd] = sentinel
    52  		simd.Memset8(main3Slice, byteVal)
    53  		if !bytes.Equal(main1Slice, main3Slice) {
    54  			t.Fatal("Mismatched Memset8 result.")
    55  		}
    56  		if main3Arr[sliceEnd] != sentinel {
    57  			t.Fatal("Memset8 clobbered an extra byte.")
    58  		}
    59  	}
    60  }
    61  
    62  /*
    63  Benchmark results:
    64    MacBook Pro (15-inch, 2016)
    65    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
    66  
    67  Benchmark_Memset8/SIMDShort1Cpu-8                     20          62706981 ns/op
    68  Benchmark_Memset8/SIMDShortHalfCpu-8                 100          17559573 ns/op
    69  Benchmark_Memset8/SIMDShortAllCpu-8                  100          17149982 ns/op
    70  Benchmark_Memset8/SIMDLong1Cpu-8                       1        1101524485 ns/op
    71  Benchmark_Memset8/SIMDLongHalfCpu-8                    2         925331938 ns/op
    72  Benchmark_Memset8/SIMDLongAllCpu-8                     2         971422170 ns/op
    73  Benchmark_Memset8/StandardShort1Cpu-8                  5         314689466 ns/op
    74  Benchmark_Memset8/StandardShortHalfCpu-8              20          88260588 ns/op
    75  Benchmark_Memset8/StandardShortAllCpu-8               20          84317546 ns/op
    76  Benchmark_Memset8/StandardLong1Cpu-8                   1        1082736141 ns/op
    77  Benchmark_Memset8/StandardLongHalfCpu-8                2         992904776 ns/op
    78  Benchmark_Memset8/StandardLongAllCpu-8                 1        1052452033 ns/op
    79  Benchmark_Memset8/RangeZeroShort1Cpu-8                30          44907924 ns/op
    80  Benchmark_Memset8/RangeZeroShortHalfCpu-8            100          24173280 ns/op
    81  Benchmark_Memset8/RangeZeroShortAllCpu-8             100          14991003 ns/op
    82  Benchmark_Memset8/RangeZeroLong1Cpu-8                  3         401003587 ns/op
    83  Benchmark_Memset8/RangeZeroLongHalfCpu-8               3         400711072 ns/op
    84  Benchmark_Memset8/RangeZeroLongAllCpu-8                3         404863223 ns/op
    85  
    86  Notes: simd.Memset8 is broadly useful for short arrays, though usually a bit
    87  worse than memclr.  However, memclr wins handily in the 249 MB long case on the
    88  test machine, thanks to AVX2 (and, in the AVX2 subroutine, cache-bypassing
    89  stores).
    90  When the simd.Memset8 AVX2 implementation is written, it should obviously
    91  imitate what memclr is doing.
    92  */
    93  
    94  func memset8SimdSubtask(dst, src []byte, nIter int) int {
    95  	for iter := 0; iter < nIter; iter++ {
    96  		simd.Memset8(dst, 78)
    97  	}
    98  	return int(dst[0])
    99  }
   100  
   101  func memset8StandardSubtask(dst, src []byte, nIter int) int {
   102  	for iter := 0; iter < nIter; iter++ {
   103  		memset8Standard(dst, 78)
   104  	}
   105  	return int(dst[0])
   106  }
   107  
   108  func memset8RangeZeroSubtask(dst, src []byte, nIter int) int {
   109  	for iter := 0; iter < nIter; iter++ {
   110  		// Compiler-recognized loop, which gets converted to a memclr call with
   111  		// fancier optimizations than simd.Memset8.
   112  		for pos := range dst {
   113  			dst[pos] = 0
   114  		}
   115  	}
   116  	return int(dst[0])
   117  }
   118  
   119  func Benchmark_Memset8(b *testing.B) {
   120  	funcs := []taggedMultiBenchFunc{
   121  		{
   122  			f:   memset8SimdSubtask,
   123  			tag: "SIMD",
   124  		},
   125  		{
   126  			f:   memset8StandardSubtask,
   127  			tag: "Standard",
   128  		},
   129  		{
   130  			f:   memset8RangeZeroSubtask,
   131  			tag: "RangeZero",
   132  		},
   133  	}
   134  	for _, f := range funcs {
   135  		// Base sequence in length-150 .bam read occupies 75 bytes, so 75 is a good
   136  		// size for the short-array benchmark.
   137  		multiBenchmark(f.f, f.tag+"Short", 75, 0, 9999999, b)
   138  		// GRCh37 chromosome 1 length is 249250621, so that's a plausible
   139  		// long-array use case.
   140  		multiBenchmark(f.f, f.tag+"Long", 249250621, 0, 50, b)
   141  	}
   142  }
   143  
   144  // This only matches UnpackedNibbleLookupInplace when all bytes < 128; the test
   145  // has been restricted accordingly.  _mm_shuffle_epi8()'s treatment of bytes >=
   146  // 128 usually isn't relevant.
   147  func unpackedNibbleLookupInplaceSlow(main []byte, tablePtr *simd.NibbleLookupTable) {
   148  	for idx := range main {
   149  		main[idx] = tablePtr.Get(main[idx] & 15)
   150  	}
   151  }
   152  
   153  func TestUnpackedNibbleLookup(t *testing.T) {
   154  	maxSize := 500
   155  	nIter := 200
   156  	main1Arr := simd.MakeUnsafe(maxSize)
   157  	main2Arr := simd.MakeUnsafe(maxSize)
   158  	main3Arr := simd.MakeUnsafe(maxSize)
   159  	main4Arr := simd.MakeUnsafe(maxSize)
   160  	main5Arr := simd.MakeUnsafe(maxSize)
   161  	table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0})
   162  	for iter := 0; iter < nIter; iter++ {
   163  		sliceStart := rand.Intn(maxSize)
   164  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   165  		main1Slice := main1Arr[sliceStart:sliceEnd]
   166  		for ii := range main1Slice {
   167  			main1Slice[ii] = byte(rand.Intn(128))
   168  		}
   169  		main2Slice := main2Arr[sliceStart:sliceEnd]
   170  		main3Slice := main3Arr[sliceStart:sliceEnd]
   171  		main4Slice := main4Arr[sliceStart:sliceEnd]
   172  		main5Slice := main5Arr[sliceStart:sliceEnd]
   173  
   174  		simd.UnpackedNibbleLookupUnsafe(main3Slice, main1Slice, &table)
   175  
   176  		sentinel := byte(rand.Intn(256))
   177  		main4Arr[sliceEnd] = sentinel
   178  		simd.UnpackedNibbleLookup(main4Slice, main1Slice, &table)
   179  
   180  		copy(main2Slice, main1Slice)
   181  		copy(main5Slice, main1Slice)
   182  
   183  		unpackedNibbleLookupInplaceSlow(main1Slice, &table)
   184  		simd.UnpackedNibbleLookupUnsafeInplace(main2Slice, &table)
   185  		if !bytes.Equal(main1Slice, main2Slice) {
   186  			t.Fatal("Mismatched UnpackedNibbleLookupUnsafeInplace result.")
   187  		}
   188  		if !bytes.Equal(main1Slice, main3Slice) {
   189  			t.Fatal("Mismatched UnpackedNibbleLookupUnsafe result.")
   190  		}
   191  		if !bytes.Equal(main1Slice, main4Slice) {
   192  			t.Fatal("Mismatched UnpackedNibbleLookup result.")
   193  		}
   194  		if main4Arr[sliceEnd] != sentinel {
   195  			t.Fatal("UnpackedNibbleLookup clobbered an extra byte.")
   196  		}
   197  
   198  		main5Arr[sliceEnd] = sentinel
   199  		simd.UnpackedNibbleLookupInplace(main5Slice, &table)
   200  		if !bytes.Equal(main1Slice, main5Slice) {
   201  			t.Fatal("Mismatched UnpackedNibbleLookupInplace result.")
   202  		}
   203  		if main5Arr[sliceEnd] != sentinel {
   204  			t.Fatal("UnpackedNibbleLookupInplace clobbered an extra byte.")
   205  		}
   206  	}
   207  }
   208  
   209  /*
   210  Benchmark results:
   211    MacBook Pro (15-inch, 2016)
   212    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   213  
   214  Benchmark_UnpackedNibbleLookupInplace/SIMDShort1Cpu-8                 20         76720863 ns/op
   215  Benchmark_UnpackedNibbleLookupInplace/SIMDShortHalfCpu-8              50         22968008 ns/op
   216  Benchmark_UnpackedNibbleLookupInplace/SIMDShortAllCpu-8              100         18896633 ns/op
   217  Benchmark_UnpackedNibbleLookupInplace/SIMDLong1Cpu-8                   1       1046243684 ns/op
   218  Benchmark_UnpackedNibbleLookupInplace/SIMDLongHalfCpu-8                2        861622838 ns/op
   219  Benchmark_UnpackedNibbleLookupInplace/SIMDLongAllCpu-8                 2        944384349 ns/op
   220  Benchmark_UnpackedNibbleLookupInplace/SlowShort1Cpu-8                  2        532267799 ns/op
   221  Benchmark_UnpackedNibbleLookupInplace/SlowShortHalfCpu-8              10        144993320 ns/op
   222  Benchmark_UnpackedNibbleLookupInplace/SlowShortAllCpu-8               10        146218387 ns/op
   223  Benchmark_UnpackedNibbleLookupInplace/SlowLong1Cpu-8                   1       7745668548 ns/op
   224  Benchmark_UnpackedNibbleLookupInplace/SlowLongHalfCpu-8                1       2169127851 ns/op
   225  Benchmark_UnpackedNibbleLookupInplace/SlowLongAllCpu-8                 1       2164900359 ns/op
   226  */
   227  
   228  func unpackedNibbleLookupInplaceSimdSubtask(dst, src []byte, nIter int) int {
   229  	table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0})
   230  	for iter := 0; iter < nIter; iter++ {
   231  		// Note that this uses the result of one lookup operation as the input to
   232  		// the next.
   233  		// (Given the current table, all values should be 1 or 0 after 3 or more
   234  		// iterations.)
   235  		simd.UnpackedNibbleLookupInplace(dst, &table)
   236  	}
   237  	return int(dst[0])
   238  }
   239  
   240  func unpackedNibbleLookupInplaceSlowSubtask(dst, src []byte, nIter int) int {
   241  	table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0})
   242  	for iter := 0; iter < nIter; iter++ {
   243  		unpackedNibbleLookupInplaceSlow(dst, &table)
   244  	}
   245  	return int(dst[0])
   246  }
   247  
   248  func Benchmark_UnpackedNibbleLookupInplace(b *testing.B) {
   249  	funcs := []taggedMultiBenchFunc{
   250  		{
   251  			f:   unpackedNibbleLookupInplaceSimdSubtask,
   252  			tag: "SIMD",
   253  		},
   254  		{
   255  			f:   unpackedNibbleLookupInplaceSlowSubtask,
   256  			tag: "Slow",
   257  		},
   258  	}
   259  	for _, f := range funcs {
   260  		multiBenchmark(f.f, f.tag+"Short", 75, 0, 9999999, b)
   261  		multiBenchmark(f.f, f.tag+"Long", 249250621, 0, 50, b)
   262  	}
   263  }
   264  
   265  func packedNibbleLookupSlow(dst, src []byte, tablePtr *simd.NibbleLookupTable) {
   266  	dstLen := len(dst)
   267  	nSrcFullByte := dstLen / 2
   268  	srcOdd := dstLen & 1
   269  	for srcPos := 0; srcPos < nSrcFullByte; srcPos++ {
   270  		srcByte := src[srcPos]
   271  		dst[2*srcPos] = tablePtr.Get(srcByte & 15)
   272  		dst[2*srcPos+1] = tablePtr.Get(srcByte >> 4)
   273  	}
   274  	if srcOdd == 1 {
   275  		srcByte := src[nSrcFullByte]
   276  		dst[2*nSrcFullByte] = tablePtr.Get(srcByte & 15)
   277  	}
   278  }
   279  
   280  func TestPackedNibbleLookup(t *testing.T) {
   281  	maxDstSize := 500
   282  	maxSrcSize := (maxDstSize + 1) / 2
   283  	nIter := 200
   284  	srcArr := simd.MakeUnsafe(maxSrcSize)
   285  	dst1Arr := simd.MakeUnsafe(maxDstSize)
   286  	dst2Arr := simd.MakeUnsafe(maxDstSize)
   287  	table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0})
   288  	for iter := 0; iter < nIter; iter++ {
   289  		srcSliceStart := rand.Intn(maxSrcSize)
   290  		dstSliceStart := srcSliceStart * 2
   291  		dstSliceEnd := dstSliceStart + rand.Intn(maxDstSize-dstSliceStart)
   292  		srcSliceEnd := (dstSliceEnd + 1) / 2
   293  		srcSlice := srcArr[srcSliceStart:srcSliceEnd]
   294  		for ii := range srcSlice {
   295  			srcSlice[ii] = byte(rand.Intn(256))
   296  		}
   297  		dst1Slice := dst1Arr[dstSliceStart:dstSliceEnd]
   298  		dst2Slice := dst2Arr[dstSliceStart:dstSliceEnd]
   299  		packedNibbleLookupSlow(dst1Slice, srcSlice, &table)
   300  		simd.PackedNibbleLookupUnsafe(dst2Slice, srcSlice, &table)
   301  		if !bytes.Equal(dst1Slice, dst2Slice) {
   302  			t.Fatal("Mismatched PackedNibbleLookupUnsafe result.")
   303  		}
   304  		// ack, missed a PackedNibbleLookup bug: it didn't write some of the last
   305  		// few bytes in some cases, but that went undetected because the previous
   306  		// PackedNibbleLookupUnsafe call pre-filled those bytes correctly.
   307  		simd.Memset8Unsafe(dst2Arr, 0)
   308  		sentinel := byte(rand.Intn(256))
   309  		dst2Arr[dstSliceEnd] = sentinel
   310  		simd.PackedNibbleLookup(dst2Slice, srcSlice, &table)
   311  		if !bytes.Equal(dst1Slice, dst2Slice) {
   312  			t.Fatal("Mismatched PackedNibbleLookup result.")
   313  		}
   314  		if dst2Arr[dstSliceEnd] != sentinel {
   315  			t.Fatal("PackedNibbleLookup clobbered an extra byte.")
   316  		}
   317  	}
   318  }
   319  
   320  /*
   321  Benchmark results:
   322    MacBook Pro (15-inch, 2016)
   323    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   324  
   325  Benchmark_PackedNibbleLookup/UnsafeShort1Cpu-8                10         143501956 ns/op
   326  Benchmark_PackedNibbleLookup/UnsafeShortHalfCpu-8             30          38748958 ns/op
   327  Benchmark_PackedNibbleLookup/UnsafeShortAllCpu-8              50          31982398 ns/op
   328  Benchmark_PackedNibbleLookup/UnsafeLong1Cpu-8                  1        1372142640 ns/op
   329  Benchmark_PackedNibbleLookup/UnsafeLongHalfCpu-8               1        1236198290 ns/op
   330  Benchmark_PackedNibbleLookup/UnsafeLongAllCpu-8                1        1265315746 ns/op
   331  Benchmark_PackedNibbleLookup/SIMDShort1Cpu-8                  10         158155872 ns/op
   332  Benchmark_PackedNibbleLookup/SIMDShortHalfCpu-8               30          43098347 ns/op
   333  Benchmark_PackedNibbleLookup/SIMDShortAllCpu-8                30          37593692 ns/op
   334  Benchmark_PackedNibbleLookup/SIMDLong1Cpu-8                    1        1407559630 ns/op
   335  Benchmark_PackedNibbleLookup/SIMDLongHalfCpu-8                 1        1244569913 ns/op
   336  Benchmark_PackedNibbleLookup/SIMDLongAllCpu-8                  1        1245648867 ns/op
   337  Benchmark_PackedNibbleLookup/SlowShort1Cpu-8                   1        1322739228 ns/op
   338  Benchmark_PackedNibbleLookup/SlowShortHalfCpu-8                3         381551545 ns/op
   339  Benchmark_PackedNibbleLookup/SlowShortAllCpu-8                 3         361846656 ns/op
   340  Benchmark_PackedNibbleLookup/SlowLong1Cpu-8                    1        9990188206 ns/op
   341  Benchmark_PackedNibbleLookup/SlowLongHalfCpu-8                 1        2855687759 ns/op
   342  Benchmark_PackedNibbleLookup/SlowLongAllCpu-8                  1        2877628266 ns/op
   343  
   344  Notes: Unsafe version of this function is also benchmarked, since the
   345  short-array safety penalty is a bit high here.  This is mainly an indicator of
   346  room for improvement in the safe function; I think it's clear at this point
   347  that we'll probably never need to use the Unsafe interface.
   348  */
   349  
   350  func packedNibbleLookupUnsafeSubtask(dst, src []byte, nIter int) int {
   351  	table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0})
   352  	for iter := 0; iter < nIter; iter++ {
   353  		simd.PackedNibbleLookupUnsafe(dst, src, &table)
   354  	}
   355  	return int(dst[0])
   356  }
   357  
   358  func packedNibbleLookupSimdSubtask(dst, src []byte, nIter int) int {
   359  	table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0})
   360  	for iter := 0; iter < nIter; iter++ {
   361  		simd.PackedNibbleLookup(dst, src, &table)
   362  	}
   363  	return int(dst[0])
   364  }
   365  
   366  func packedNibbleLookupSlowSubtask(dst, src []byte, nIter int) int {
   367  	table := simd.MakeNibbleLookupTable([16]byte{0, 1, 0, 2, 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0})
   368  	for iter := 0; iter < nIter; iter++ {
   369  		packedNibbleLookupSlow(dst, src, &table)
   370  	}
   371  	return int(dst[0])
   372  }
   373  
   374  func Benchmark_PackedNibbleLookup(b *testing.B) {
   375  	funcs := []taggedMultiBenchFunc{
   376  		{
   377  			f:   packedNibbleLookupUnsafeSubtask,
   378  			tag: "Unsafe",
   379  		},
   380  		{
   381  			f:   packedNibbleLookupSimdSubtask,
   382  			tag: "SIMD",
   383  		},
   384  		{
   385  			f:   packedNibbleLookupSlowSubtask,
   386  			tag: "Slow",
   387  		},
   388  	}
   389  	for _, f := range funcs {
   390  		multiBenchmark(f.f, f.tag+"Short", 150, 75, 9999999, b)
   391  		multiBenchmark(f.f, f.tag+"Long", 249250621, 249250622/2, 50, b)
   392  	}
   393  }
   394  
   395  func interleaveSlow(dst, even, odd []byte) {
   396  	dstLen := len(dst)
   397  	evenLen := (dstLen + 1) >> 1
   398  	oddLen := dstLen >> 1
   399  	for idx, oddByte := range odd {
   400  		dst[2*idx] = even[idx]
   401  		dst[2*idx+1] = oddByte
   402  	}
   403  	if oddLen != evenLen {
   404  		dst[oddLen*2] = even[oddLen]
   405  	}
   406  }
   407  
   408  func TestInterleave(t *testing.T) {
   409  	maxSrcSize := 500
   410  	maxDstSize := 2 * maxSrcSize
   411  	nIter := 200
   412  	evenArr := simd.MakeUnsafe(maxSrcSize)
   413  	oddArr := simd.MakeUnsafe(maxSrcSize)
   414  	dst1Arr := simd.MakeUnsafe(maxDstSize)
   415  	dst2Arr := simd.MakeUnsafe(maxDstSize)
   416  	for iter := 0; iter < nIter; iter++ {
   417  		srcSliceStart := rand.Intn(maxSrcSize)
   418  		dstSliceStart := srcSliceStart * 2
   419  		dstSliceEnd := dstSliceStart + rand.Intn(maxDstSize-dstSliceStart)
   420  		evenSliceEnd := (dstSliceEnd + 1) >> 1
   421  		oddSliceEnd := dstSliceEnd >> 1
   422  		evenSlice := evenArr[srcSliceStart:evenSliceEnd]
   423  		oddSlice := oddArr[srcSliceStart:oddSliceEnd]
   424  		for ii := range evenSlice {
   425  			evenSlice[ii] = byte(rand.Intn(256))
   426  		}
   427  		for ii := range oddSlice {
   428  			oddSlice[ii] = byte(rand.Intn(256))
   429  		}
   430  		dst1Slice := dst1Arr[dstSliceStart:dstSliceEnd]
   431  		dst2Slice := dst2Arr[dstSliceStart:dstSliceEnd]
   432  		interleaveSlow(dst1Slice, evenSlice, oddSlice)
   433  		simd.Interleave8Unsafe(dst2Slice, evenSlice, oddSlice)
   434  		if !bytes.Equal(dst1Slice, dst2Slice) {
   435  			t.Fatal("Mismatched Interleave8Unsafe result.")
   436  		}
   437  		sentinel := byte(rand.Intn(256))
   438  		dst2Arr[dstSliceEnd] = sentinel
   439  		simd.Interleave8(dst2Slice, evenSlice, oddSlice)
   440  		if !bytes.Equal(dst1Slice, dst2Slice) {
   441  			t.Fatal("Mismatched Interleave8 result.")
   442  		}
   443  		if dst2Arr[dstSliceEnd] != sentinel {
   444  			t.Fatal("Interleave8 clobbered an extra byte.")
   445  		}
   446  	}
   447  }
   448  
   449  /*
   450  Benchmark results:
   451    MacBook Pro (15-inch, 2016)
   452    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   453  
   454  Benchmark_Interleave/UnsafeShort1Cpu-8                10         124397567 ns/op
   455  Benchmark_Interleave/UnsafeShortHalfCpu-8             50          33427370 ns/op
   456  Benchmark_Interleave/UnsafeShortAllCpu-8              50          27522495 ns/op
   457  Benchmark_Interleave/UnsafeLong1Cpu-8                  1        1364788736 ns/op
   458  Benchmark_Interleave/UnsafeLongHalfCpu-8               1        1194034677 ns/op
   459  Benchmark_Interleave/UnsafeLongAllCpu-8                1        1240540994 ns/op
   460  Benchmark_Interleave/SIMDShort1Cpu-8                  10         143574503 ns/op
   461  Benchmark_Interleave/SIMDShortHalfCpu-8               30          40429942 ns/op
   462  Benchmark_Interleave/SIMDShortAllCpu-8                50          30500450 ns/op
   463  Benchmark_Interleave/SIMDLong1Cpu-8                    1        1281952758 ns/op
   464  Benchmark_Interleave/SIMDLongHalfCpu-8                 1        1210134670 ns/op
   465  Benchmark_Interleave/SIMDLongAllCpu-8                  1        1284786977 ns/op
   466  Benchmark_Interleave/SlowShort1Cpu-8                   2         880545817 ns/op
   467  Benchmark_Interleave/SlowShortHalfCpu-8                5         234673823 ns/op
   468  Benchmark_Interleave/SlowShortAllCpu-8                 5         230332535 ns/op
   469  Benchmark_Interleave/SlowLong1Cpu-8                    1        6669283712 ns/op
   470  Benchmark_Interleave/SlowLongHalfCpu-8                 1        1860713287 ns/op
   471  Benchmark_Interleave/SlowLongAllCpu-8                  1        1807886977 ns/op
   472  */
   473  
   474  func interleaveUnsafeSubtask(dst, src []byte, nIter int) int {
   475  	for iter := 0; iter < nIter; iter++ {
   476  		simd.Interleave8Unsafe(dst, src, src)
   477  	}
   478  	return int(dst[0])
   479  }
   480  
   481  func interleaveSimdSubtask(dst, src []byte, nIter int) int {
   482  	for iter := 0; iter < nIter; iter++ {
   483  		simd.Interleave8(dst, src, src)
   484  	}
   485  	return int(dst[0])
   486  }
   487  
   488  func interleaveSlowSubtask(dst, src []byte, nIter int) int {
   489  	for iter := 0; iter < nIter; iter++ {
   490  		interleaveSlow(dst, src, src)
   491  	}
   492  	return int(dst[0])
   493  }
   494  
   495  func Benchmark_Interleave(b *testing.B) {
   496  	funcs := []taggedMultiBenchFunc{
   497  		{
   498  			f:   interleaveUnsafeSubtask,
   499  			tag: "Unsafe",
   500  		},
   501  		{
   502  			f:   interleaveSimdSubtask,
   503  			tag: "SIMD",
   504  		},
   505  		{
   506  			f:   interleaveSlowSubtask,
   507  			tag: "Slow",
   508  		},
   509  	}
   510  	for _, f := range funcs {
   511  		multiBenchmark(f.f, f.tag+"Short", 150, 75, 9999999, b)
   512  		multiBenchmark(f.f, f.tag+"Long", 124625311*2, 124625311, 50, b)
   513  	}
   514  }
   515  
   516  func reverse8Slow(main []byte) {
   517  	nByte := len(main)
   518  	nByteDiv2 := nByte >> 1
   519  	for idx, invIdx := 0, nByte-1; idx != nByteDiv2; idx, invIdx = idx+1, invIdx-1 {
   520  		main[idx], main[invIdx] = main[invIdx], main[idx]
   521  	}
   522  }
   523  
   524  func TestReverse8(t *testing.T) {
   525  	maxSize := 500
   526  	nIter := 200
   527  	main1Arr := simd.MakeUnsafe(maxSize)
   528  	main2Arr := simd.MakeUnsafe(maxSize)
   529  	main3Arr := simd.MakeUnsafe(maxSize)
   530  	main4Arr := simd.MakeUnsafe(maxSize)
   531  	src2Arr := simd.MakeUnsafe(maxSize)
   532  	for iter := 0; iter < nIter; iter++ {
   533  		sliceStart := rand.Intn(maxSize)
   534  		sliceEnd := sliceStart + rand.Intn(maxSize-sliceStart)
   535  		main1Slice := main1Arr[sliceStart:sliceEnd]
   536  		main2Slice := main2Arr[sliceStart:sliceEnd]
   537  		main3Slice := main3Arr[sliceStart:sliceEnd]
   538  		main4Slice := main4Arr[sliceStart:sliceEnd]
   539  		src2Slice := src2Arr[sliceStart:sliceEnd]
   540  		for ii := range main1Slice {
   541  			main1Slice[ii] = byte(rand.Intn(256))
   542  		}
   543  		copy(main2Slice, main1Slice)
   544  		copy(src2Slice, main1Slice)
   545  		sentinel := byte(rand.Intn(256))
   546  		main2Arr[sliceEnd] = sentinel
   547  		main4Arr[sliceEnd] = sentinel
   548  		simd.Reverse8Unsafe(main3Slice, main1Slice)
   549  		simd.Reverse8(main4Slice, main1Slice)
   550  		reverse8Slow(main1Slice)
   551  		simd.Reverse8Inplace(main2Slice)
   552  		if !bytes.Equal(main1Slice, main2Slice) {
   553  			t.Fatal("Mismatched Reverse8Inplace result.")
   554  		}
   555  		if main2Arr[sliceEnd] != sentinel {
   556  			t.Fatal("Reverse8Inplace clobbered an extra byte.")
   557  		}
   558  		if !bytes.Equal(main1Slice, main3Slice) {
   559  			t.Fatal("Mismatched Reverse8Unsafe result.")
   560  		}
   561  		if !bytes.Equal(main1Slice, main4Slice) {
   562  			t.Fatal("Mismatched Reverse8 result.")
   563  		}
   564  		if main4Arr[sliceEnd] != sentinel {
   565  			t.Fatal("Reverse8 clobbered an extra byte.")
   566  		}
   567  		simd.Reverse8Inplace(main4Slice)
   568  		if !bytes.Equal(src2Slice, main4Slice) {
   569  			t.Fatal("Reverse8Inplace didn't invert itself.")
   570  		}
   571  	}
   572  }
   573  
   574  /*
   575  Benchmark results:
   576    MacBook Pro (15-inch, 2016)
   577    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   578  
   579  Benchmark_Reverse8Inplace/SIMDShort1Cpu-8                     20          67121510 ns/op
   580  Benchmark_Reverse8Inplace/SIMDShortHalfCpu-8                 100          18891965 ns/op
   581  Benchmark_Reverse8Inplace/SIMDShortAllCpu-8                  100          16177224 ns/op
   582  Benchmark_Reverse8Inplace/SIMDLong1Cpu-8                       1        1115497033 ns/op
   583  Benchmark_Reverse8Inplace/SIMDLongHalfCpu-8                    2         885764257 ns/op
   584  Benchmark_Reverse8Inplace/SIMDLongAllCpu-8                     2         941948715 ns/op
   585  Benchmark_Reverse8Inplace/SlowShort1Cpu-8                      3         398662666 ns/op
   586  Benchmark_Reverse8Inplace/SlowShortHalfCpu-8                  10         105618119 ns/op
   587  Benchmark_Reverse8Inplace/SlowShortAllCpu-8                   10         184808267 ns/op
   588  Benchmark_Reverse8Inplace/SlowLong1Cpu-8                       1        5665556658 ns/op
   589  Benchmark_Reverse8Inplace/SlowLongHalfCpu-8                    1        1597487158 ns/op
   590  Benchmark_Reverse8Inplace/SlowLongAllCpu-8                     1        1616963854 ns/op
   591  */
   592  
   593  func reverse8InplaceSimdSubtask(dst, src []byte, nIter int) int {
   594  	for iter := 0; iter < nIter; iter++ {
   595  		simd.Reverse8Inplace(dst)
   596  	}
   597  	return int(dst[0])
   598  }
   599  
   600  func reverse8InplaceSlowSubtask(dst, src []byte, nIter int) int {
   601  	for iter := 0; iter < nIter; iter++ {
   602  		reverse8Slow(dst)
   603  	}
   604  	return int(dst[0])
   605  }
   606  
   607  func Benchmark_Reverse8Inplace(b *testing.B) {
   608  	funcs := []taggedMultiBenchFunc{
   609  		{
   610  			f:   reverse8InplaceSimdSubtask,
   611  			tag: "SIMD",
   612  		},
   613  		{
   614  			f:   reverse8InplaceSlowSubtask,
   615  			tag: "Slow",
   616  		},
   617  	}
   618  	for _, f := range funcs {
   619  		multiBenchmark(f.f, f.tag+"Short", 75, 0, 9999999, b)
   620  		multiBenchmark(f.f, f.tag+"Long", 249250621, 0, 50, b)
   621  	}
   622  }
   623  
   624  func bitFromEveryByteSlow(dst, src []byte, bitIdx int) {
   625  	requiredDstLen := (len(src) + 7) >> 3
   626  	if (len(dst) < requiredDstLen) || (uint(bitIdx) > 7) {
   627  		panic("BitFromEveryByte requires len(dst) >= (len(src) + 7) / 8 and 0 <= bitIdx < 8.")
   628  	}
   629  	dst = dst[:requiredDstLen]
   630  	for i := range dst {
   631  		dst[i] = 0
   632  	}
   633  	for i, b := range src {
   634  		dst[i>>3] |= ((b >> uint32(bitIdx)) & 1) << uint32(i&7)
   635  	}
   636  }
   637  
   638  func bitFromEveryByteFancyNoasm(dst, src []byte, bitIdx int) {
   639  	requiredDstLen := (len(src) + 7) >> 3
   640  	if (len(dst) < requiredDstLen) || (uint(bitIdx) > 7) {
   641  		panic("BitFromEveryByte requires len(dst) >= (len(src) + 7) / 8 and 0 <= bitIdx < 8.")
   642  	}
   643  	nSrcFullWord := len(src) >> 3
   644  	for i := 0; i < nSrcFullWord; i++ {
   645  		// Tried using a unsafeBytesToWords function on src in place of
   646  		// binary.LittleEndian.Uint64, and it barely made any difference.
   647  		srcWord := binary.LittleEndian.Uint64(src[i*8:i*8+8]) >> uint32(bitIdx)
   648  
   649  		srcWord &= 0x101010101010101
   650  
   651  		// Before this operation, the bits of interest are at positions 0, 8, 16,
   652  		// 24, 32, 40, 48, and 56 in srcWord, and all other bits are guaranteed to
   653  		// be zero.
   654  		//
   655  		// Suppose the bit at position 16 is set, and no other bits are set.  What
   656  		// does multiplication by the magic number 0x102040810204080 accomplish?
   657  		// Well, the magic number has bits set at positions 7, 14, 21, 28, 35, 42,
   658  		// 49, and 56.  Multiplying by 2^16 is equivalent to left-shifting by 16,
   659  		// so the product has bits set at positions (7+16), (14+16), (21+16),
   660  		// (28+16), (35+16), (42+16), and the last two overflow off the top end.
   661  		//
   662  		// Now suppose the bits at position 0 and 16 are both set.  The result is
   663  		// then the sum of (2^0) * <magic number> + (2^16) * <magic number>.  The
   664  		// first term in this sum has bits set at positions 7, 14, ..., 56.
   665  		// Critically, *none of these bits overlap with the second term*, so there
   666  		// are no 'carries' when we add the two terms together.  So the final
   667  		// product has bits set at positions 7, 14, 21, 23, 28, 30, 35, 37, 42, 44,
   668  		// 49, 51, 56, and 58.
   669  		//
   670  		// It turns out that none of the bits in any of the 8 terms of this product
   671  		// have overlapping positions.  So the multiplication operation just makes
   672  		// a bunch of left-shifted copies of the original bits... and in
   673  		// particular, bits 56-63 of the product are:
   674  		//   56: original bit 0, left-shifted 56
   675  		//   57: original bit 8, left-shifted 49
   676  		//   58: original bit 16, left-shifted 42
   677  		//   59: original bit 24, left-shifted 35
   678  		//   60: original bit 32, left-shifted 28
   679  		//   61: original bit 40, left-shifted 21
   680  		//   62: original bit 48, left-shifted 14
   681  		//   63: original bit 56, left-shifted 7
   682  		// Thus, right-shifting the product by 56 gives us the byte we want.
   683  		//
   684  		// This is a very esoteric algorithm, and it doesn't have much direct
   685  		// application because all 64-bit x86 processors provide an assembly
   686  		// instruction which lets you do this >6 times as quickly.  Occasionally
   687  		// the idea of using multiplication to create staggered left-shifted copies
   688  		// of bits does genuinely come in handy, though.
   689  		dst[i] = byte((srcWord * 0x102040810204080) >> 56)
   690  	}
   691  	if nSrcFullWord != requiredDstLen {
   692  		srcLast := src[nSrcFullWord*8:]
   693  		dstLast := dst[nSrcFullWord:requiredDstLen]
   694  		for i := range dstLast {
   695  			dstLast[i] = 0
   696  		}
   697  		for i, b := range srcLast {
   698  			dstLast[i>>3] |= ((b >> uint32(bitIdx)) & 1) << uint32(i&7)
   699  		}
   700  	}
   701  }
   702  
   703  func TestBitFromEveryByte(t *testing.T) {
   704  	maxSize := 500
   705  	nIter := 200
   706  	rand.Seed(1)
   707  	srcArr := make([]byte, maxSize)
   708  	dstArr1 := make([]byte, maxSize)
   709  	dstArr2 := make([]byte, maxSize)
   710  	dstArr3 := make([]byte, maxSize)
   711  	for iter := 0; iter < nIter; iter++ {
   712  		sliceStart := rand.Intn(maxSize)
   713  		srcSize := rand.Intn(maxSize - sliceStart)
   714  		srcSliceEnd := sliceStart + srcSize
   715  		srcSlice := srcArr[sliceStart:srcSliceEnd]
   716  
   717  		minDstSize := (srcSize + 7) >> 3
   718  		dstSliceEnd := sliceStart + minDstSize
   719  		dstSlice1 := dstArr1[sliceStart:dstSliceEnd]
   720  		dstSlice2 := dstArr2[sliceStart:dstSliceEnd]
   721  		dstSlice3 := dstArr3[sliceStart:dstSliceEnd]
   722  
   723  		for ii := range srcSlice {
   724  			srcSlice[ii] = byte(rand.Intn(256))
   725  		}
   726  		sentinel := byte(rand.Intn(256))
   727  		dstArr2[dstSliceEnd] = sentinel
   728  
   729  		bitIdx := rand.Intn(8)
   730  		bitFromEveryByteSlow(dstSlice1, srcSlice, bitIdx)
   731  		simd.BitFromEveryByte(dstSlice2, srcSlice, bitIdx)
   732  		assert.EQ(t, dstSlice1, dstSlice2)
   733  		assert.EQ(t, sentinel, dstArr2[dstSliceEnd])
   734  
   735  		// Also validate the assembly-free multiplication-based algorithm.
   736  		sentinel = byte(rand.Intn(256))
   737  		dstArr3[dstSliceEnd] = sentinel
   738  		bitFromEveryByteFancyNoasm(dstSlice3, srcSlice, bitIdx)
   739  		assert.EQ(t, dstSlice1, dstSlice3)
   740  		assert.EQ(t, sentinel, dstArr3[dstSliceEnd])
   741  	}
   742  }
   743  
   744  /*
   745  Benchmark results:
   746    MacBook Pro (15-inch, 2016)
   747    2.7 GHz Intel Core i7, 16 GB 2133 MHz LPDDR3
   748  
   749  Benchmark_BitFromEveryByte/SIMDLong1Cpu-8                    200           6861450 ns/op
   750  Benchmark_BitFromEveryByte/SIMDLongHalfCpu-8                 200           7360937 ns/op
   751  Benchmark_BitFromEveryByte/SIMDLongAllCpu-8                  200           8846261 ns/op
   752  Benchmark_BitFromEveryByte/FancyNoasmLong1Cpu-8               20          58756902 ns/op
   753  Benchmark_BitFromEveryByte/FancyNoasmLongHalfCpu-8                   100         17244847 ns/op
   754  Benchmark_BitFromEveryByte/FancyNoasmLongAllCpu-8                    100         16624282 ns/op
   755  Benchmark_BitFromEveryByte/SlowLong1Cpu-8                              3        422073091 ns/op
   756  Benchmark_BitFromEveryByte/SlowLongHalfCpu-8                          10        117732813 ns/op
   757  Benchmark_BitFromEveryByte/SlowLongAllCpu-8                           10        114903556 ns/op
   758  
   759  Notes: 1Cpu has higher throughput than HalfCpu/AllCpu on this test machine due
   760  to L3 cache saturation: multiBenchmarkDstSrc makes each goroutine process its
   761  own ~4 MB job, rather than splitting a single job into smaller pieces, and a
   762  15-inch 2016 MacBook Pro has a 8 MB L3 cache.  If you shrink the test size to
   763  len(src)=400000, HalfCpu outperforms 1Cpu by the expected amount.
   764  
   765  I'm leaving this unusual benchmark result here since (i) it corresponds to how
   766  we actually need to use the function, and (ii) this phenomenon is definitely
   767  worth knowing about.
   768  */
   769  
   770  func bitFromEveryByteSimdSubtask(dst, src []byte, nIter int) int {
   771  	for iter := 0; iter < nIter; iter++ {
   772  		simd.BitFromEveryByte(dst, src, 0)
   773  	}
   774  	return int(dst[0])
   775  }
   776  
   777  func bitFromEveryByteFancyNoasmSubtask(dst, src []byte, nIter int) int {
   778  	for iter := 0; iter < nIter; iter++ {
   779  		bitFromEveryByteFancyNoasm(dst, src, 0)
   780  	}
   781  	return int(dst[0])
   782  }
   783  
   784  func bitFromEveryByteSlowSubtask(dst, src []byte, nIter int) int {
   785  	for iter := 0; iter < nIter; iter++ {
   786  		bitFromEveryByteSlow(dst, src, 0)
   787  	}
   788  	return int(dst[0])
   789  }
   790  
   791  func Benchmark_BitFromEveryByte(b *testing.B) {
   792  	funcs := []taggedMultiBenchFunc{
   793  		{
   794  			f:   bitFromEveryByteSimdSubtask,
   795  			tag: "SIMD",
   796  		},
   797  		{
   798  			f:   bitFromEveryByteFancyNoasmSubtask,
   799  			tag: "FancyNoasm",
   800  		},
   801  		{
   802  			f:   bitFromEveryByteSlowSubtask,
   803  			tag: "Slow",
   804  		},
   805  	}
   806  	for _, f := range funcs {
   807  		multiBenchmark(f.f, f.tag+"Long", 4091904/8, 4091904, 50, b)
   808  	}
   809  }