github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/simd_amd64.go

// Copyright 2021 GRAIL, Inc.  All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

//go:build amd64 && !appengine
// +build amd64,!appengine

package simd

import (
	"math/bits"
	"reflect"
	"unsafe"

	"golang.org/x/sys/cpu"
)

// amd64 compile-time constants.

// BytesPerWord is the number of bytes in a machine word.
// We don't use unsafe.Sizeof(uintptr(1)) since there are advantages to having
// this as an untyped constant, and there's essentially no drawback since this
// is an _amd64-specific file.
const BytesPerWord = 8

// Log2BytesPerWord is log2(BytesPerWord).  This is relevant for manual
// bit-shifting when we know that's a safe way to divide and the compiler does
// not (e.g. dividend is of signed int type).
const Log2BytesPerWord = uint(3)

// BitsPerWord is the number of bits in a machine word.
const BitsPerWord = BytesPerWord * 8

// This must be at least <maximum supported vector size> / 16.
const nibbleLookupDup = 1

// NibbleLookupTable represents a parallel-byte-substitution operation f, where
// every byte b in a byte-slice is replaced with
//   f(b) := shuffle[0][b & 15] for b <= 127, and
//   f(b) := 0 for b > 127.
// (The second part is usually irrelevant in practice, but must be defined this
// way to allow _mm_shuffle_epi8()/_mm256_shuffle_epi8()/_mm512_shuffle_epi8()
// to be used to implement the operation efficiently.)
// It's named NibbleLookupTable rather than ByteLookupTable since only the
// bottom nibble of each byte can be used for table lookup.
// It potentially stores multiple adjacent copies of the lookup table since
// that speeds up the AVX2 and AVX-512 use cases (the table can be loaded with
// a single _mm256_loadu_si256 operation, instead of e.g. _mm_loadu_si128
// followed by _mm256_set_m128i with the same argument twice), and the typical
// use case involves initializing very few tables and using them many, many
// times.
type NibbleLookupTable struct {
	shuffle [nibbleLookupDup][16]byte
}

// Get performs the b <= 127 part of the lookup operation described above.
// The caller is expected to mask b down to its bottom nibble first: shuffle[0]
// has only 16 entries, so Get panics on b >= 16.  The b > 127 branch is
// omitted because in many use cases (e.g. PackedNibbleLookup below), it can
// be proven that b > 127 is impossible, and removing the if-statement is a
// significant performance win when it's possible.
func (t *NibbleLookupTable) Get(b byte) byte {
	return t.shuffle[0][b]
}
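
// As a caller-side sketch of the substitution semantics (the table values
// below are illustrative, not taken from this package), a table can be built
// once with MakeNibbleLookupTable and queried with Get:
//
//	var nibblePopcnt = simd.MakeNibbleLookupTable([16]byte{
//		0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
//	})
//	_ = nibblePopcnt.Get(0x0b & 15) // == 3 bits set in the low nibble of 0x0b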

// const minPageSize = 4096  may be relevant for safe functions soon.

// bytesPerVec is the size of the maximum-width vector that may be used.  It is
// at least 16, but will soon be set to 32 if AVX2 support is detected.  It
// may be set to 64 in the future when AVX-512 is detected.
var bytesPerVec int

// log2BytesPerVec supports efficient division by bytesPerVec.
var log2BytesPerVec uint

// *** the following functions are defined in simd_amd64.s

// There was an unpackedNibbleLookupInplaceSSSE3Asm function here, but it
// actually benchmarked worse than the general-case function.

//go:noescape
func unpackedNibbleLookupTinyInplaceSSSE3Asm(main, tablePtr unsafe.Pointer)

//go:noescape
func unpackedNibbleLookupOddInplaceSSSE3Asm(main, tablePtr unsafe.Pointer, nByte int)

//go:noescape
func unpackedNibbleLookupSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nByte int)

//go:noescape
func unpackedNibbleLookupOddSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nByte int)

//go:noescape
func packedNibbleLookupSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nSrcByte int)

//go:noescape
func packedNibbleLookupOddSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nSrcFullByte int)

//go:noescape
func interleave8SSE2Asm(dst, even, odd unsafe.Pointer, nDstByte int)

//go:noescape
func interleave8OddSSE2Asm(dst, even, odd unsafe.Pointer, nOddByte int)

//go:noescape
func reverse8InplaceSSSE3Asm(main unsafe.Pointer, nByte int)

//go:noescape
func reverse8SSSE3Asm(dst, src unsafe.Pointer, nByte int)

//go:noescape
func bitFromEveryByteSSE2Asm(dst, src unsafe.Pointer, lshift, nDstByte int)

// *** end assembly function signatures

func init() {
	if !cpu.X86.HasSSE42 {
		panic("SSE4.2 required.")
	}
	bytesPerVec = 16
	log2BytesPerVec = 4
}

// BytesPerVec is an accessor for the bytesPerVec package variable.
func BytesPerVec() int {
	return bytesPerVec
}

// RoundUpPow2 returns val rounded up to a multiple of alignment, assuming
// alignment is a power of 2.
func RoundUpPow2(val, alignment int) int {
	return (val + alignment - 1) & (^(alignment - 1))
}

// DivUpPow2 efficiently divides a number by a power-of-2 divisor.  (This works
// for negative dividends since the language specifies arithmetic right-shifts
// of signed numbers.  I'm pretty sure this doesn't have a performance
// penalty.)
func DivUpPow2(dividend, divisor int, log2Divisor uint) int {
	return (dividend + divisor - 1) >> log2Divisor
}
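
// Worked examples for the two helpers above (a caller-side sketch with
// arbitrary values):
//
//	simd.RoundUpPow2(13, 8)  // == 16: (13+7) &^ 7
//	simd.RoundUpPow2(16, 8)  // == 16: already aligned
//	simd.DivUpPow2(13, 8, 3) // == 2:  ceil(13/8)
//	simd.DivUpPow2(-5, 8, 3) // == 0:  the arithmetic right-shift keeps the
//	                         //        result correct for negative dividends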

// MakeUnsafe returns a byte slice of the given length which is guaranteed to
// have enough capacity for all Unsafe functions in this package to work.  (It
// is not itself an unsafe function: allocated memory is zero-initialized.)
// Note that Unsafe functions occasionally have other caveats: e.g.
// PopcntUnsafe also requires relevant bytes past the end of the slice to be
// zeroed out.
func MakeUnsafe(len int) []byte {
	// Although no planned function requires more than
	// RoundUpPow2(len+1, bytesPerVec) capacity, it is necessary to add
	// bytesPerVec instead to make subslicing safe.
	return make([]byte, len, len+bytesPerVec)
}
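
// A caller-side sketch of the intended pattern (sizes are arbitrary):
//
//	buf := simd.MakeUnsafe(1000)  // len 1000, cap >= 1000 + bytesPerVec
//	simd.Memset8Unsafe(buf, 0x2e) // capacity guarantee satisfied
//	sub := buf[:500]              // subslices inherit the trailing slack,
//	simd.Memset8Unsafe(sub, 0x21) // so this is safe too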

// RemakeUnsafe reuses the given buffer if it has sufficient capacity;
// otherwise it does the same thing as MakeUnsafe.  It does NOT preserve
// existing contents of buf[]; use ResizeUnsafe() for that.
func RemakeUnsafe(bufptr *[]byte, len int) {
	minCap := len + bytesPerVec
	if minCap <= cap(*bufptr) {
		*bufptr = (*bufptr)[:len]
		return
	}
	// This is likely to be called in an inner loop processing variable-size
	// inputs, so mild exponential growth is appropriate.
	*bufptr = make([]byte, len, RoundUpPow2(minCap+(minCap/8), bytesPerVec))
}

// ResizeUnsafe changes the length of buf and ensures it has enough extra
// capacity to be passed to this package's Unsafe functions.  Existing buf[]
// contents are preserved (with possible truncation), though when length is
// increased, new bytes might not be zero-initialized.
func ResizeUnsafe(bufptr *[]byte, len int) {
	minCap := len + bytesPerVec
	if minCap <= cap(*bufptr) {
		*bufptr = (*bufptr)[:len]
		return
	}
	dst := make([]byte, len, RoundUpPow2(minCap+(minCap/8), bytesPerVec))
	copy(dst, *bufptr)
	*bufptr = dst
}

// XcapUnsafe is shorthand for ResizeUnsafe's most common use case (no length
// change, just want to ensure sufficient capacity).
func XcapUnsafe(bufptr *[]byte) {
	ResizeUnsafe(bufptr, len(*bufptr))
}
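
// A sketch of the reuse pattern these functions support in an inner loop over
// variable-size inputs (records and table are hypothetical):
//
//	var buf []byte
//	for _, rec := range records {
//		simd.RemakeUnsafe(&buf, len(rec)) // reuses the allocation; old contents discarded
//		copy(buf, rec)
//		simd.UnpackedNibbleLookupInplace(buf, &table)
//	}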

// Memset8Unsafe sets all values of dst[] to the given byte.  (This is intended
// for val != 0.  It is better to use a range-for loop for val == 0 since the
// compiler has a hardcoded optimization for that case; see
// https://github.com/golang/go/issues/5373 .)
//
// WARNING: This is a function designed to be used in inner loops, which
// assumes without checking that capacity is at least RoundUpPow2(len(dst),
// bytesPerVec).  It also assumes that the caller does not care if a few bytes
// past the end of dst[] are changed.  Use the safe version of this function if
// any of these properties are problematic.
// These assumptions are always satisfied when the last
// potentially-size-increasing operation on dst[] is {Make,Remake}Unsafe(),
// ResizeUnsafe(), or XcapUnsafe().
func Memset8Unsafe(dst []byte, val byte) {
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	valWord := uintptr(0x0101010101010101) * uintptr(val)
	// The compiler optimizes this loop well; my first attempt at an SSE
	// implementation did not do better on my Mac, and neither did a non-AVX2
	// direct copy of runtime.memclr_amd64.
	// With that said, benchmarking against memclr reveals that AVX2 (and
	// non-temporal stores in the >32 MiB case) makes a significant difference.
	nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord)
	dstWordsIter := unsafe.Pointer(dstHeader.Data)
	for widx := 0; widx < nWord; widx++ {
		*((*uintptr)(dstWordsIter)) = valWord
		dstWordsIter = unsafe.Add(dstWordsIter, BytesPerWord)
	}
}

// Memset8 sets all values of dst[] to the given byte.  (This is intended for
// val != 0.  It is better to use a range-for loop for val == 0 since the
// compiler has a hardcoded optimization for that case.)
func Memset8(dst []byte, val byte) {
	// This is ~2-8% slower than the unsafe version.
	dstLen := len(dst)
	if dstLen < BytesPerWord {
		for pos := range dst {
			dst[pos] = val
		}
		return
	}
	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
	valWord := uintptr(0x0101010101010101) * uintptr(val)
	// Write nWordMinus1 full words from the start of dst, then one final word
	// ending exactly at dstLen; the final store may overlap the previous one,
	// which is harmless.
	nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord
	dstWordsIter := dstData
	for widx := 0; widx < nWordMinus1; widx++ {
		*((*uintptr)(dstWordsIter)) = valWord
		dstWordsIter = unsafe.Add(dstWordsIter, BytesPerWord)
	}
	dstWordsIter = unsafe.Add(dstData, dstLen-BytesPerWord)
	*((*uintptr)(dstWordsIter)) = valWord
}
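
// A caller-side sketch contrasting the two cases discussed above:
//
//	simd.Memset8(buf, 'N') // nonzero fill: use Memset8
//	for i := range buf {   // zero fill: prefer the loop, which the compiler
//		buf[i] = 0     // lowers to an optimized memclr
//	}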

// MakeNibbleLookupTable generates a NibbleLookupTable from a [16]byte.
func MakeNibbleLookupTable(table [16]byte) (t NibbleLookupTable) {
	for i := range t.shuffle {
		t.shuffle[i] = table
	}
	return
}
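
// For example, a table mapping each nibble to its ASCII hex digit (an
// illustrative table, not one defined by this package):
//
//	hexTable := simd.MakeNibbleLookupTable([16]byte{
//		'0', '1', '2', '3', '4', '5', '6', '7',
//		'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
//	})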

// UnpackedNibbleLookupUnsafeInplace replaces the bytes in main[] as follows:
//   if value < 128, set to table[value & 15]
//   otherwise, set to 0
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about capacity which aren't checked at runtime.  Use the safe
// version of this function when that's a problem.
// These assumptions are always satisfied when the last
// potentially-size-increasing operation on main[] is {Make,Remake}Unsafe(),
// ResizeUnsafe(), or XcapUnsafe().
//
// 1. cap(main) must be at least RoundUpPow2(len(main) + 1, bytesPerVec).
//
// 2. The caller does not care if a few bytes past the end of main[] are
// changed.
func UnpackedNibbleLookupUnsafeInplace(main []byte, tablePtr *NibbleLookupTable) {
	mainLen := len(main)
	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
	if mainLen <= 16 {
		// This originally just set mainLen = bytesPerVec and rejoined the main
		// branch, but that produced noticeably worse benchmark results, even
		// for the usual case.
		unpackedNibbleLookupTinyInplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), unsafe.Pointer(tablePtr))
		return
	}
	unpackedNibbleLookupOddInplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), unsafe.Pointer(tablePtr), mainLen)
}

// UnpackedNibbleLookupInplace replaces the bytes in main[] as follows:
//   if value < 128, set to table[value & 15]
//   otherwise, set to 0
func UnpackedNibbleLookupInplace(main []byte, tablePtr *NibbleLookupTable) {
	// May want to define variants of these functions which have undefined
	// results for input values in [16, 128); this will be useful for
	// cross-platform ARM/x86 code.
	mainLen := len(main)
	if mainLen < 16 {
		// Tried copying to and from a [16]byte; the overhead of that was too
		// high.  (I consider the poor performance of this case to be one of
		// the strongest justifications for exporting Unsafe functions at all.)
		for pos, curByte := range main {
			if curByte < 128 {
				curByte = tablePtr.Get(curByte & 15)
			} else {
				curByte = 0
			}
			main[pos] = curByte
		}
		return
	}
	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
	unpackedNibbleLookupOddInplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), unsafe.Pointer(tablePtr), mainLen)
}
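
// A minimal sketch (hexTable as in the earlier example):
//
//	main := []byte{0x00, 0x05, 0x0f, 0x8f}
//	simd.UnpackedNibbleLookupInplace(main, &hexTable)
//	// main is now {'0', '5', 'f', 0}: 0x8f has its high bit set, so it maps
//	// to zero rather than to the table entry for 0xf.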

// UnpackedNibbleLookupUnsafe sets the bytes in dst[] as follows:
//   if src[pos] < 128, set dst[pos] := table[src[pos] & 15]
//   otherwise, set dst[pos] := 0
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about length and capacity which aren't checked at runtime.  Use
// the safe version of this function when that's a problem.
// Assumptions #2-3 are always satisfied when the last
// potentially-size-increasing operation on src[] is {Make,Remake}Unsafe(),
// ResizeUnsafe(), or XcapUnsafe(), and the same is true for dst[].
//
// 1. len(src) and len(dst) are equal.
//
// 2. Capacities are at least RoundUpPow2(len(src) + 1, bytesPerVec).
//
// 3. The caller does not care if a few bytes past the end of dst[] are
// changed.
func UnpackedNibbleLookupUnsafe(dst, src []byte, tablePtr *NibbleLookupTable) {
	srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	unpackedNibbleLookupSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcHeader.Len)
}

// UnpackedNibbleLookup sets the bytes in dst[] as follows:
//   if src[pos] < 128, set dst[pos] := table[src[pos] & 15]
//   otherwise, set dst[pos] := 0
// It panics if len(src) != len(dst).
func UnpackedNibbleLookup(dst, src []byte, tablePtr *NibbleLookupTable) {
	srcLen := len(src)
	if len(dst) != srcLen {
		panic("UnpackedNibbleLookup() requires len(src) == len(dst).")
	}
	if srcLen < 16 {
		for pos, curByte := range src {
			if curByte < 128 {
				curByte = tablePtr.Get(curByte & 15)
			} else {
				curByte = 0
			}
			dst[pos] = curByte
		}
		return
	}
	srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	unpackedNibbleLookupOddSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcLen)
}
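
// The same operation with distinct src and dst, as a sketch (hexTable as
// above):
//
//	src := []byte{0x01, 0x0a, 0x90}
//	dst := make([]byte, len(src))
//	simd.UnpackedNibbleLookup(dst, src, &hexTable)
//	// dst == {'1', 'a', 0}; the lengths must match or the call panics.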

// UnpackedNibbleLookupS is a variant of UnpackedNibbleLookup() that takes
// string src.
func UnpackedNibbleLookupS(dst []byte, src string, tablePtr *NibbleLookupTable) {
	srcLen := len(src)
	if len(dst) != srcLen {
		panic("UnpackedNibbleLookupS() requires len(src) == len(dst).")
	}
	if srcLen < 16 {
		for pos := range src {
			curByte := src[pos]
			if curByte < 128 {
				curByte = tablePtr.Get(curByte & 15)
			} else {
				curByte = 0
			}
			dst[pos] = curByte
		}
		return
	}
	srcHeader := (*reflect.StringHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	unpackedNibbleLookupOddSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcLen)
}

// PackedNibbleLookupUnsafe sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := table[src[pos / 2] & 15]
//   if pos is odd, dst[pos] := table[src[pos / 2] >> 4]
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about length and capacity which aren't checked at runtime.  Use
// the safe version of this function when that's a problem.
// Assumptions #2-3 are always satisfied when the last
// potentially-size-increasing operation on src[] is {Make,Remake}Unsafe(),
// ResizeUnsafe(), or XcapUnsafe(), and the same is true for dst[].
//
// 1. len(src) == (len(dst) + 1) / 2.
//
// 2. Capacity of src is at least RoundUpPow2(len(src) + 1, bytesPerVec), and
// the same is true for dst.
//
// 3. The caller does not care if a few bytes past the end of dst[] are
// changed.
func PackedNibbleLookupUnsafe(dst, src []byte, tablePtr *NibbleLookupTable) {
	// Note that this is not the correct order for .bam seq[] unpacking; use
	// biosimd.UnpackAndReplaceSeqUnsafe() for that.
	srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	packedNibbleLookupSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcHeader.Len)
}

// PackedNibbleLookup sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := table[src[pos / 2] & 15]
//   if pos is odd, dst[pos] := table[src[pos / 2] >> 4]
// It panics if len(src) != (len(dst) + 1) / 2.
//
// Nothing bad happens if len(dst) is odd and some high bits in the last src[]
// byte are set, though it's generally good practice to ensure that case
// doesn't come up.
func PackedNibbleLookup(dst, src []byte, tablePtr *NibbleLookupTable) {
	// This takes ~15% longer than the unsafe function on the short-array
	// benchmark.
	dstLen := len(dst)
	nSrcFullByte := dstLen >> 1
	srcOdd := dstLen & 1
	if len(src) != nSrcFullByte+srcOdd {
		panic("PackedNibbleLookup() requires len(src) == (len(dst) + 1) / 2.")
	}
	if nSrcFullByte < 16 {
		for srcPos := 0; srcPos < nSrcFullByte; srcPos++ {
			srcByte := src[srcPos]
			dst[2*srcPos] = tablePtr.Get(srcByte & 15)
			dst[2*srcPos+1] = tablePtr.Get(srcByte >> 4)
		}
	} else {
		srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
		dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
		packedNibbleLookupOddSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), nSrcFullByte)
	}
	if srcOdd == 1 {
		srcByte := src[nSrcFullByte]
		dst[2*nSrcFullByte] = tablePtr.Get(srcByte & 15)
	}
}
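
// A sketch unpacking two nibbles per input byte (hexTable as above; the low
// nibble of each src byte comes first):
//
//	src := []byte{0x21, 0x43}
//	dst := make([]byte, 4)
//	simd.PackedNibbleLookup(dst, src, &hexTable)
//	// dst == {'1', '2', '3', '4'}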

// Interleave8Unsafe sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := even[pos/2]
//   if pos is odd, dst[pos] := odd[pos/2]
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about length and capacity which aren't checked at runtime.  Use
// the safe version of this function when that's a problem.
// Assumptions #2-3 are always satisfied when the last
// potentially-size-increasing operation on dst[] is {Make,Remake}Unsafe(),
// ResizeUnsafe(), or XcapUnsafe(), and the same is true for even[] and odd[].
//
// 1. len(even) = (len(dst) + 1) / 2, and len(odd) = len(dst) / 2.
//
// 2. cap(dst) >= RoundUpPow2(len(dst) + 1, bytesPerVec),
// cap(even) >= RoundUpPow2(len(even) + 1, bytesPerVec), and
// cap(odd) >= RoundUpPow2(len(odd) + 1, bytesPerVec).
//
// 3. The caller does not care if a few bytes past the end of dst[] are
// changed.
func Interleave8Unsafe(dst, even, odd []byte) {
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	evenHeader := (*reflect.SliceHeader)(unsafe.Pointer(&even))
	oddHeader := (*reflect.SliceHeader)(unsafe.Pointer(&odd))
	interleave8SSE2Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(evenHeader.Data), unsafe.Pointer(oddHeader.Data), dstHeader.Len)
}

// Interleave8 sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := even[pos/2]
//   if pos is odd, dst[pos] := odd[pos/2]
// It panics if ((len(dst) + 1) / 2) != len(even), or (len(dst) / 2) !=
// len(odd).
func Interleave8(dst, even, odd []byte) {
	// This is ~6-20% slower than the unsafe function on the short-array
	// benchmark.
	dstLen := len(dst)
	evenLen := (dstLen + 1) >> 1
	oddLen := dstLen >> 1
	if (len(even) != evenLen) || (len(odd) != oddLen) {
		panic("Interleave8() requires len(even) == (len(dst) + 1) / 2, and len(odd) == len(dst) / 2.")
	}
	if oddLen < 16 {
		for idx, oddByte := range odd {
			dst[2*idx] = even[idx]
			dst[2*idx+1] = oddByte
		}
	} else {
		dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
		evenHeader := (*reflect.SliceHeader)(unsafe.Pointer(&even))
		oddHeader := (*reflect.SliceHeader)(unsafe.Pointer(&odd))
		interleave8OddSSE2Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(evenHeader.Data), unsafe.Pointer(oddHeader.Data), oddLen)
	}
	if oddLen != evenLen {
		dst[oddLen*2] = even[oddLen]
	}
}
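
// A sketch with an odd-length destination (so even[] is one byte longer than
// odd[]):
//
//	even := []byte{'a', 'b', 'c'}
//	odd := []byte{'1', '2'}
//	dst := make([]byte, 5)
//	simd.Interleave8(dst, even, odd)
//	// dst == "a1b2c"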

// Reverse8Inplace reverses the bytes in main[].  (There is no unsafe version
// of this function.)
func Reverse8Inplace(main []byte) {
	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
	reverse8InplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), mainHeader.Len)
}

// Reverse8Unsafe sets dst[pos] := src[len(src) - 1 - pos] for every position
// in src.
//
// WARNING: This does not verify len(dst) == len(src); call the safe version of
// this function if you want that.
func Reverse8Unsafe(dst, src []byte) {
	nByte := len(src)
	if nByte < BytesPerWord {
		// could use bswap32 on two uint32s if nByte in 4..7
		nByteMinus1 := nByte - 1
		for idx := 0; idx != nByte; idx++ {
			dst[nByteMinus1-idx] = src[idx]
		}
		return
	}
	srcData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src)).Data)
	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
	if nByte < 16 {
		// use bswap64 on a word at a time: dst advances forward while src
		// walks backward from its last full word
		nWordMinus1 := (nByte - 1) >> Log2BytesPerWord
		finalOffset := uintptr(nByte) - BytesPerWord
		srcIter := unsafe.Add(srcData, finalOffset)
		dstIter := dstData
		for widx := 0; widx < nWordMinus1; widx++ {
			srcWord := *((*uintptr)(srcIter))
			*((*uintptr)(dstIter)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
			srcIter = unsafe.Add(srcIter, -BytesPerWord)
			dstIter = unsafe.Add(dstIter, BytesPerWord)
		}
		srcFirstWordPtr := srcData
		dstLastWordPtr := unsafe.Add(dstData, finalOffset)
		srcWord := *((*uintptr)(srcFirstWordPtr))
		*((*uintptr)(dstLastWordPtr)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
		return
	}
	reverse8SSSE3Asm(dstData, srcData, nByte)
}

// Reverse8 sets dst[pos] := src[len(src) - 1 - pos] for every position in src.
// It panics if len(src) != len(dst).
func Reverse8(dst, src []byte) {
	nByte := len(src)
	if nByte != len(dst) {
		panic("Reverse8() requires len(src) == len(dst).")
	}
	if nByte < BytesPerWord {
		// could use bswap32 on two uint32s if nByte in 4..7
		nByteMinus1 := nByte - 1
		for idx := 0; idx != nByte; idx++ {
			dst[nByteMinus1-idx] = src[idx]
		}
		return
	}
	srcData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src)).Data)
	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
	if nByte < 16 {
		// use bswap64 on a word at a time: dst advances forward while src
		// walks backward from its last full word
		nWordMinus1 := (nByte - 1) >> Log2BytesPerWord
		finalOffset := uintptr(nByte) - BytesPerWord
		srcIter := unsafe.Add(srcData, finalOffset)
		dstIter := dstData
		for widx := 0; widx < nWordMinus1; widx++ {
			srcWord := *((*uintptr)(srcIter))
			*((*uintptr)(dstIter)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
			srcIter = unsafe.Add(srcIter, -BytesPerWord)
			dstIter = unsafe.Add(dstIter, BytesPerWord)
		}
		srcFirstWordPtr := srcData
		dstLastWordPtr := unsafe.Add(dstData, finalOffset)
		srcWord := *((*uintptr)(srcFirstWordPtr))
		*((*uintptr)(dstLastWordPtr)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
		return
	}
	reverse8SSSE3Asm(dstData, srcData, nByte)
}
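
// A sketch of both forms (caller-side view):
//
//	src := []byte("GATTACA")
//	dst := make([]byte, len(src))
//	simd.Reverse8(dst, src)   // dst == "ACATTAG"
//	simd.Reverse8Inplace(src) // src == "ACATTAG"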

// BitFromEveryByte fills dst[] with a bitarray containing every 8th bit from
// src[], starting with bitIdx, where bitIdx is in [0,7].  If len(src) is not
// divisible by 8, extra bits in the last filled byte of dst are set to zero.
//
// For example, if src[] is
//   0x1f 0x33 0x0d 0x00 0x51 0xcc 0x34 0x59 0x44
// and bitIdx is 2, bit 2 from every byte is
//      1    0    1    0    0    1    1    0    1
// so dst[] is filled with
//   0x65 0x01.
//
// - It panics if len(dst) < (len(src) + 7) / 8, or bitIdx isn't in [0,7].
// - If dst is larger than necessary, the extra bytes are not changed.
func BitFromEveryByte(dst, src []byte, bitIdx int) {
	requiredDstLen := (len(src) + 7) >> 3
	if (len(dst) < requiredDstLen) || (uint(bitIdx) > 7) {
		panic("BitFromEveryByte requires len(dst) >= (len(src) + 7) / 8 and 0 <= bitIdx < 8.")
	}
	nSrcVecByte := len(src) &^ (bytesPerVec - 1)
	if nSrcVecByte != 0 {
		bitFromEveryByteSSE2Asm(unsafe.Pointer(&dst[0]), unsafe.Pointer(&src[0]), 7-bitIdx, nSrcVecByte>>3)
	}
	remainder := len(src) - nSrcVecByte
	if remainder != 0 {
		// Not optimized since it isn't expected to matter.
		srcLast := src[nSrcVecByte:]
		dstLast := dst[nSrcVecByte>>3 : requiredDstLen]
		for i := range dstLast {
			dstLast[i] = 0
		}
		for i, b := range srcLast {
			dstLast[i>>3] |= ((b >> uint32(bitIdx)) & 1) << uint32(i&7)
		}
	}
}
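
// A sketch reproducing the worked example in the comment above:
//
//	src := []byte{0x1f, 0x33, 0x0d, 0x00, 0x51, 0xcc, 0x34, 0x59, 0x44}
//	dst := make([]byte, (len(src)+7)/8)
//	simd.BitFromEveryByte(dst, src, 2)
//	// dst == {0x65, 0x01}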