github.com/grailbio/base@v0.0.11/simd/simd_amd64.go

// Copyright 2021 GRAIL, Inc. All rights reserved.
// Use of this source code is governed by the Apache-2.0
// license that can be found in the LICENSE file.

//go:build amd64 && !appengine
// +build amd64,!appengine

package simd

import (
	"math/bits"
	"reflect"
	"unsafe"

	"golang.org/x/sys/cpu"
)

// amd64 compile-time constants.

// BytesPerWord is the number of bytes in a machine word.
// We don't use unsafe.Sizeof(uintptr(1)) since there are advantages to having
// this as an untyped constant, and there's essentially no drawback since this
// is an _amd64-specific file.
const BytesPerWord = 8

// Log2BytesPerWord is log2(BytesPerWord). This is relevant for manual
// bit-shifting when we know that's a safe way to divide and the compiler does
// not (e.g. dividend is of signed int type).
const Log2BytesPerWord = uint(3)

// BitsPerWord is the number of bits in a machine word.
const BitsPerWord = BytesPerWord * 8

// This must be at least <maximum supported vector size> / 16.
const nibbleLookupDup = 1

// NibbleLookupTable represents a parallel-byte-substitution operation f, where
// every byte b in a byte-slice is replaced with
//   f(b) := shuffle[0][b & 15] for b <= 127, and
//   f(b) := 0 for b > 127.
// (The second part is usually irrelevant in practice, but must be defined this
// way to allow _mm_shuffle_epi8()/_mm256_shuffle_epi8()/_mm512_shuffle_epi8()
// to be used to implement the operation efficiently.)
// It's named NibbleLookupTable rather than ByteLookupTable since only the
// bottom nibble of each byte can be used for table lookup.
// It potentially stores multiple adjacent copies of the lookup table since
// that speeds up the AVX2 and AVX-512 use cases (the table can be loaded with
// a single _mm256_loadu_si256 operation, instead of e.g. _mm_loadu_si128
// followed by _mm256_set_m128i with the same argument twice), and the typical
// use case involves initializing very few tables and using them many, many
// times.
type NibbleLookupTable struct {
	shuffle [nibbleLookupDup][16]byte
}

// Get performs the b <= 127 part of the lookup operation described above.
// The b > 127 branch is omitted because in many use cases (e.g.
// PackedNibbleLookup below), it can be proven that b > 127 is impossible, and
// removing the if-statement is a significant performance win when it's
// possible.
func (t *NibbleLookupTable) Get(b byte) byte {
	return t.shuffle[0][b]
}
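
// A minimal caller-side usage sketch (added for illustration; not from the
// original source). The per-nibble popcount table is hypothetical example
// data; MakeNibbleLookupTable is defined later in this file:
//
//	// Map each low nibble to its number of set bits.
//	popcnt := simd.MakeNibbleLookupTable(
//		[16]byte{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4})
//	_ = popcnt.Get(0xb) // == 3, since 0xb == 0b1011
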
// const minPageSize = 4096 may be relevant for safe functions soon.

// bytesPerVec is the size of the maximum-width vector that may be used. It is
// at least 16, but will soon be set to 32 if AVX2 support is detected. It
// may be set to 64 in the future when AVX-512 is detected.
var bytesPerVec int

// log2BytesPerVec supports efficient division by bytesPerVec.
var log2BytesPerVec uint

// *** the following functions are defined in simd_amd64.s

// There was an unpackedNibbleLookupInplaceSSSE3Asm function here, but it
// actually benchmarked worse than the general-case function.

//go:noescape
func unpackedNibbleLookupTinyInplaceSSSE3Asm(main, tablePtr unsafe.Pointer)

//go:noescape
func unpackedNibbleLookupOddInplaceSSSE3Asm(main, tablePtr unsafe.Pointer, nByte int)

//go:noescape
func unpackedNibbleLookupSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nByte int)

//go:noescape
func unpackedNibbleLookupOddSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nByte int)

//go:noescape
func packedNibbleLookupSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nSrcByte int)

//go:noescape
func packedNibbleLookupOddSSSE3Asm(dst, src, tablePtr unsafe.Pointer, nSrcFullByte int)

//go:noescape
func interleave8SSE2Asm(dst, even, odd unsafe.Pointer, nDstByte int)

//go:noescape
func interleave8OddSSE2Asm(dst, even, odd unsafe.Pointer, nOddByte int)

//go:noescape
func reverse8InplaceSSSE3Asm(main unsafe.Pointer, nByte int)

//go:noescape
func reverse8SSSE3Asm(dst, src unsafe.Pointer, nByte int)

//go:noescape
func bitFromEveryByteSSE2Asm(dst, src unsafe.Pointer, lshift, nDstByte int)

// *** end assembly function signatures

func init() {
	if !cpu.X86.HasSSE42 {
		panic("SSE4.2 required.")
	}
	bytesPerVec = 16
	log2BytesPerVec = 4
}

// BytesPerVec is an accessor for the bytesPerVec package variable.
func BytesPerVec() int {
	return bytesPerVec
}

// RoundUpPow2 returns val rounded up to a multiple of alignment, assuming
// alignment is a power of 2.
func RoundUpPow2(val, alignment int) int {
	return (val + alignment - 1) & (^(alignment - 1))
}

// DivUpPow2 efficiently divides a number by a power-of-2 divisor. (This works
// for negative dividends since the language specifies arithmetic right-shifts
// of signed numbers. I'm pretty sure this doesn't have a performance
// penalty.)
func DivUpPow2(dividend, divisor int, log2Divisor uint) int {
	return (dividend + divisor - 1) >> log2Divisor
}
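
// A worked example for the two helpers above (illustrative only; assumes the
// package is imported as simd):
//
//	simd.RoundUpPow2(37, 16) // == 48: (37 + 15) &^ 15
//	simd.DivUpPow2(-1, 8, 3) // == 0: (-1 + 8 - 1) >> 3; the arithmetic
//	                         // right-shift handles the negative dividend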
// MakeUnsafe returns a byte slice of the given length which is guaranteed to
// have enough capacity for all Unsafe functions in this package to work. (It
// is not itself an unsafe function: allocated memory is zero-initialized.)
// Note that Unsafe functions occasionally have other caveats: e.g.
// PopcntUnsafe also requires relevant bytes past the end of the slice to be
// zeroed out.
func MakeUnsafe(len int) []byte {
	// Although no planned function requires more than
	// RoundUpPow2(len+1, bytesPerVec) capacity, it is necessary to add
	// bytesPerVec instead to make subslicing safe.
	return make([]byte, len, len+bytesPerVec)
}

// RemakeUnsafe reuses the given buffer if it has sufficient capacity;
// otherwise it does the same thing as MakeUnsafe. It does NOT preserve
// existing contents of buf[]; use ResizeUnsafe() for that.
func RemakeUnsafe(bufptr *[]byte, len int) {
	minCap := len + bytesPerVec
	if minCap <= cap(*bufptr) {
		*bufptr = (*bufptr)[:len]
		return
	}
	// This is likely to be called in an inner loop processing variable-size
	// inputs, so mild exponential growth is appropriate.
	*bufptr = make([]byte, len, RoundUpPow2(minCap+(minCap/8), bytesPerVec))
}

// ResizeUnsafe changes the length of buf and ensures it has enough extra
// capacity to be passed to this package's Unsafe functions. Existing buf[]
// contents are preserved (with possible truncation), though when length is
// increased, new bytes might not be zero-initialized.
func ResizeUnsafe(bufptr *[]byte, len int) {
	minCap := len + bytesPerVec
	if minCap <= cap(*bufptr) {
		*bufptr = (*bufptr)[:len]
		return
	}
	dst := make([]byte, len, RoundUpPow2(minCap+(minCap/8), bytesPerVec))
	copy(dst, *bufptr)
	*bufptr = dst
}

// XcapUnsafe is shorthand for ResizeUnsafe's most common use case (no length
// change, just want to ensure sufficient capacity).
func XcapUnsafe(bufptr *[]byte) {
	ResizeUnsafe(bufptr, len(*bufptr))
}
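
// A typical buffer-management pattern for this package (an illustrative
// sketch, not from the original source; n and k are hypothetical sizes):
//
//	buf := simd.MakeUnsafe(n) // len n, plus hidden tail capacity
//	// ... fill buf ...
//	simd.ResizeUnsafe(&buf, n+k) // grow; existing contents preserved
//	// buf can now be passed to any Unsafe function in this package.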
// Memset8Unsafe sets all values of dst[] to the given byte. (This is intended
// for val != 0. It is better to use a range-for loop for val == 0 since the
// compiler has a hardcoded optimization for that case; see
// https://github.com/golang/go/issues/5373 .)
//
// WARNING: This is a function designed to be used in inner loops, which
// assumes without checking that capacity is at least RoundUpPow2(len(dst),
// bytesPerVec). It also assumes that the caller does not care if a few bytes
// past the end of dst[] are changed. Use the safe version of this function if
// any of these properties are problematic.
// These assumptions are always satisfied when the last
// potentially-size-increasing operation on dst[] is {Make,Remake}Unsafe(),
// ResizeUnsafe(), or XcapUnsafe().
func Memset8Unsafe(dst []byte, val byte) {
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	valWord := uintptr(0x0101010101010101) * uintptr(val)
	// The compiler optimizes this loop well; my first attempt at an SSE
	// implementation did not do better on my Mac, and neither did a non-AVX2
	// direct copy of runtime.memclr_amd64.
	// With that said, a benchmark against memclr reveals that AVX2 (and
	// non-temporal stores in the >32 MiB case) makes a significant difference.
	nWord := DivUpPow2(len(dst), BytesPerWord, Log2BytesPerWord)
	dstWordsIter := unsafe.Pointer(dstHeader.Data)
	for widx := 0; widx < nWord; widx++ {
		*((*uintptr)(dstWordsIter)) = valWord
		dstWordsIter = unsafe.Add(dstWordsIter, BytesPerWord)
	}
}

// Memset8 sets all values of dst[] to the given byte. (This is intended for
// val != 0. It is better to use a range-for loop for val == 0 since the
// compiler has a hardcoded optimization for that case.)
func Memset8(dst []byte, val byte) {
	// This is ~2-8% slower than the unsafe version.
	dstLen := len(dst)
	if dstLen < BytesPerWord {
		for pos := range dst {
			dst[pos] = val
		}
		return
	}
	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
	valWord := uintptr(0x0101010101010101) * uintptr(val)
	// Write all words except the last, then finish with a possibly-overlapping
	// word-write ending exactly at dst's last byte, so no out-of-range bytes
	// are touched.
	nWordMinus1 := (dstLen - 1) >> Log2BytesPerWord
	dstWordsIter := dstData
	for widx := 0; widx < nWordMinus1; widx++ {
		*((*uintptr)(dstWordsIter)) = valWord
		dstWordsIter = unsafe.Add(dstWordsIter, BytesPerWord)
	}
	dstWordsIter = unsafe.Add(dstData, dstLen-BytesPerWord)
	*((*uintptr)(dstWordsIter)) = valWord
}
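
// Example (illustrative sketch; caller-side):
//
//	line := make([]byte, 80)
//	simd.Memset8(line, '.') // fill with a nonzero byte
//	// For zero-filling, prefer the compiler-recognized idiom instead:
//	for i := range line {
//		line[i] = 0
//	}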
// MakeNibbleLookupTable generates a NibbleLookupTable from a [16]byte.
func MakeNibbleLookupTable(table [16]byte) (t NibbleLookupTable) {
	for i := range t.shuffle {
		t.shuffle[i] = table
	}
	return
}

// UnpackedNibbleLookupUnsafeInplace replaces the bytes in main[] as follows:
//   if value < 128, set to table[value & 15]
//   otherwise, set to 0
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about capacity which aren't checked at runtime. Use the safe
// version of this function when that's a problem.
// These assumptions are always satisfied when the last
// potentially-size-increasing operation on main[] is {Make,Remake}Unsafe(),
// ResizeUnsafe(), or XcapUnsafe().
//
// 1. cap(main) must be at least RoundUpPow2(len(main) + 1, bytesPerVec).
//
// 2. The caller does not care if a few bytes past the end of main[] are
// changed.
func UnpackedNibbleLookupUnsafeInplace(main []byte, tablePtr *NibbleLookupTable) {
	mainLen := len(main)
	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
	if mainLen <= 16 {
		// This originally just set mainLen = bytesPerVec and rejoined the main
		// branch, but that produced noticeably worse benchmark results, even
		// for the usual case.
		unpackedNibbleLookupTinyInplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), unsafe.Pointer(tablePtr))
		return
	}
	unpackedNibbleLookupOddInplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), unsafe.Pointer(tablePtr), mainLen)
}

// UnpackedNibbleLookupInplace replaces the bytes in main[] as follows:
//   if value < 128, set to table[value & 15]
//   otherwise, set to 0
func UnpackedNibbleLookupInplace(main []byte, tablePtr *NibbleLookupTable) {
	// May want to define variants of these functions which have undefined
	// results for input values in [16, 128); this will be useful for
	// cross-platform ARM/x86 code.
	mainLen := len(main)
	if mainLen < 16 {
		// Tried copying to and from a [16]byte, but the overhead of that was
		// too high. (I consider the poor performance of this case to be one of
		// the strongest justifications for exporting Unsafe functions at all.)
		for pos, curByte := range main {
			if curByte < 128 {
				curByte = tablePtr.Get(curByte & 15)
			} else {
				curByte = 0
			}
			main[pos] = curByte
		}
		return
	}
	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
	unpackedNibbleLookupOddInplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), unsafe.Pointer(tablePtr), mainLen)
}

// UnpackedNibbleLookupUnsafe sets the bytes in dst[] as follows:
//   if src[pos] < 128, set dst[pos] := table[src[pos] & 15]
//   otherwise, set dst[pos] := 0
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about length and capacity which aren't checked at runtime. Use
// the safe version of this function when that's a problem.
// Assumptions #2-#3 are always satisfied when the last
// potentially-size-increasing operation on src[] is {Re}makeUnsafe(),
// ResizeUnsafe(), or XcapUnsafe(), and the same is true for dst[].
//
// 1. len(src) and len(dst) are equal.
//
// 2. Capacities are at least RoundUpPow2(len(src) + 1, bytesPerVec).
//
// 3. The caller does not care if a few bytes past the end of dst[] are
// changed.
func UnpackedNibbleLookupUnsafe(dst, src []byte, tablePtr *NibbleLookupTable) {
	srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	unpackedNibbleLookupSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcHeader.Len)
}

// UnpackedNibbleLookup sets the bytes in dst[] as follows:
//   if src[pos] < 128, set dst[pos] := table[src[pos] & 15]
//   otherwise, set dst[pos] := 0
// It panics if len(src) != len(dst).
func UnpackedNibbleLookup(dst, src []byte, tablePtr *NibbleLookupTable) {
	srcLen := len(src)
	if len(dst) != srcLen {
		panic("UnpackedNibbleLookup() requires len(src) == len(dst).")
	}
	if srcLen < 16 {
		for pos, curByte := range src {
			if curByte < 128 {
				curByte = tablePtr.Get(curByte & 15)
			} else {
				curByte = 0
			}
			dst[pos] = curByte
		}
		return
	}
	srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	unpackedNibbleLookupOddSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcLen)
}

// UnpackedNibbleLookupS is a variant of UnpackedNibbleLookup() that takes a
// string src.
func UnpackedNibbleLookupS(dst []byte, src string, tablePtr *NibbleLookupTable) {
	srcLen := len(src)
	if len(dst) != srcLen {
		panic("UnpackedNibbleLookupS() requires len(src) == len(dst).")
	}
	if srcLen < 16 {
		for pos := range src {
			curByte := src[pos]
			if curByte < 128 {
				curByte = tablePtr.Get(curByte & 15)
			} else {
				curByte = 0
			}
			dst[pos] = curByte
		}
		return
	}
	srcHeader := (*reflect.StringHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	unpackedNibbleLookupOddSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcLen)
}
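
// Example for the unpacked-lookup functions above (illustrative sketch; the
// 4-bit base encoding shown is hypothetical, not something defined by this
// package):
//
//	// Translate 4-bit codes 1/2/4/8 to A/C/G/T, everything else to 'N'.
//	table := simd.MakeNibbleLookupTable([16]byte{
//		'N', 'A', 'C', 'N', 'G', 'N', 'N', 'N',
//		'T', 'N', 'N', 'N', 'N', 'N', 'N', 'N'})
//	src := []byte{1, 2, 4, 8, 15}
//	dst := make([]byte, len(src))
//	simd.UnpackedNibbleLookup(dst, src, &table)
//	// dst is now []byte("ACGTN")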
// PackedNibbleLookupUnsafe sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := table[src[pos / 2] & 15]
//   if pos is odd, dst[pos] := table[src[pos / 2] >> 4]
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about length and capacity which aren't checked at runtime. Use
// the safe version of this function when that's a problem.
// Assumptions #2-#3 are always satisfied when the last
// potentially-size-increasing operation on src[] is {Re}makeUnsafe(),
// ResizeUnsafe(), or XcapUnsafe(), and the same is true for dst[].
//
// 1. len(src) == (len(dst) + 1) / 2.
//
// 2. Capacity of src is at least RoundUpPow2(len(src) + 1, bytesPerVec), and
// the same is true for dst.
//
// 3. The caller does not care if a few bytes past the end of dst[] are
// changed.
func PackedNibbleLookupUnsafe(dst, src []byte, tablePtr *NibbleLookupTable) {
	// Note that this is not the correct order for .bam seq[] unpacking; use
	// biosimd.UnpackAndReplaceSeqUnsafe() for that.
	srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	packedNibbleLookupSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), srcHeader.Len)
}

// PackedNibbleLookup sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := table[src[pos / 2] & 15]
//   if pos is odd, dst[pos] := table[src[pos / 2] >> 4]
// It panics if len(src) != (len(dst) + 1) / 2.
//
// Nothing bad happens if len(dst) is odd and some high bits in the last src[]
// byte are set, though it's generally good practice to ensure that case
// doesn't come up.
func PackedNibbleLookup(dst, src []byte, tablePtr *NibbleLookupTable) {
	// This takes ~15% longer than the unsafe function on the short-array
	// benchmark.
	dstLen := len(dst)
	nSrcFullByte := dstLen >> 1
	srcOdd := dstLen & 1
	if len(src) != nSrcFullByte+srcOdd {
		panic("PackedNibbleLookup() requires len(src) == (len(dst) + 1) / 2.")
	}
	if nSrcFullByte < 16 {
		for srcPos := 0; srcPos < nSrcFullByte; srcPos++ {
			srcByte := src[srcPos]
			dst[2*srcPos] = tablePtr.Get(srcByte & 15)
			dst[2*srcPos+1] = tablePtr.Get(srcByte >> 4)
		}
	} else {
		srcHeader := (*reflect.SliceHeader)(unsafe.Pointer(&src))
		dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
		packedNibbleLookupOddSSSE3Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(srcHeader.Data), unsafe.Pointer(tablePtr), nSrcFullByte)
	}
	if srcOdd == 1 {
		srcByte := src[nSrcFullByte]
		dst[2*nSrcFullByte] = tablePtr.Get(srcByte & 15)
	}
}
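
// Example for the packed-lookup functions above (illustrative sketch; the
// hex-digit table is hypothetical):
//
//	hexDigits := simd.MakeNibbleLookupTable([16]byte{
//		'0', '1', '2', '3', '4', '5', '6', '7',
//		'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'})
//	src := []byte{0x21, 0x43} // low nibble comes first within each byte
//	dst := make([]byte, 4)
//	simd.PackedNibbleLookup(dst, src, &hexDigits)
//	// dst is now []byte("1234")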
// Interleave8Unsafe sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := even[pos/2]
//   if pos is odd, dst[pos] := odd[pos/2]
//
// WARNING: This is a function designed to be used in inner loops, which makes
// assumptions about length and capacity which aren't checked at runtime. Use
// the safe version of this function when that's a problem.
// Assumptions #2-#3 are always satisfied when the last
// potentially-size-increasing operation on dst[] is {Re}makeUnsafe(),
// ResizeUnsafe(), or XcapUnsafe(), and the same is true for even[] and odd[].
//
// 1. len(even) == (len(dst) + 1) / 2, and len(odd) == len(dst) / 2.
//
// 2. cap(dst) >= RoundUpPow2(len(dst) + 1, bytesPerVec),
//    cap(even) >= RoundUpPow2(len(even) + 1, bytesPerVec), and
//    cap(odd) >= RoundUpPow2(len(odd) + 1, bytesPerVec).
//
// 3. The caller does not care if a few bytes past the end of dst[] are
// changed.
func Interleave8Unsafe(dst, even, odd []byte) {
	dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
	evenHeader := (*reflect.SliceHeader)(unsafe.Pointer(&even))
	oddHeader := (*reflect.SliceHeader)(unsafe.Pointer(&odd))
	interleave8SSE2Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(evenHeader.Data), unsafe.Pointer(oddHeader.Data), dstHeader.Len)
}

// Interleave8 sets the bytes in dst[] as follows:
//   if pos is even, dst[pos] := even[pos/2]
//   if pos is odd, dst[pos] := odd[pos/2]
// It panics if ((len(dst) + 1) / 2) != len(even), or (len(dst) / 2) !=
// len(odd).
func Interleave8(dst, even, odd []byte) {
	// This is ~6-20% slower than the unsafe function on the short-array
	// benchmark.
	dstLen := len(dst)
	evenLen := (dstLen + 1) >> 1
	oddLen := dstLen >> 1
	if (len(even) != evenLen) || (len(odd) != oddLen) {
		panic("Interleave8() requires len(even) == (len(dst) + 1) / 2, and len(odd) == len(dst) / 2.")
	}
	if oddLen < 16 {
		for idx, oddByte := range odd {
			dst[2*idx] = even[idx]
			dst[2*idx+1] = oddByte
		}
	} else {
		dstHeader := (*reflect.SliceHeader)(unsafe.Pointer(&dst))
		evenHeader := (*reflect.SliceHeader)(unsafe.Pointer(&even))
		oddHeader := (*reflect.SliceHeader)(unsafe.Pointer(&odd))
		interleave8OddSSE2Asm(unsafe.Pointer(dstHeader.Data), unsafe.Pointer(evenHeader.Data), unsafe.Pointer(oddHeader.Data), oddLen)
	}
	if oddLen != evenLen {
		// dstLen is odd; copy the final even[] byte.
		dst[oddLen*2] = even[oddLen]
	}
}
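
// Example (illustrative sketch):
//
//	even := []byte{'a', 'b', 'c'}
//	odd := []byte{'1', '2'}
//	dst := make([]byte, 5)
//	simd.Interleave8(dst, even, odd)
//	// dst is now []byte("a1b2c")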
// Reverse8Inplace reverses the bytes in main[]. (There is no unsafe version
// of this function.)
func Reverse8Inplace(main []byte) {
	mainHeader := (*reflect.SliceHeader)(unsafe.Pointer(&main))
	reverse8InplaceSSSE3Asm(unsafe.Pointer(mainHeader.Data), mainHeader.Len)
}

// Reverse8Unsafe sets dst[pos] := src[len(src) - 1 - pos] for every position
// in src.
//
// WARNING: This does not verify len(dst) == len(src); call the safe version
// of this function if you want that.
func Reverse8Unsafe(dst, src []byte) {
	nByte := len(src)
	if nByte < BytesPerWord {
		// could use bswap32 on two uint32s if nByte in 4..7
		nByteMinus1 := nByte - 1
		for idx := 0; idx != nByte; idx++ {
			dst[nByteMinus1-idx] = src[idx]
		}
		return
	}
	srcData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src)).Data)
	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
	if nByte < 16 {
		// use bswap64 on a word at a time
		nWordMinus1 := (nByte - 1) >> Log2BytesPerWord
		finalOffset := uintptr(nByte) - BytesPerWord
		srcIter := unsafe.Add(srcData, finalOffset)
		dstIter := dstData
		for widx := 0; widx < nWordMinus1; widx++ {
			srcWord := *((*uintptr)(srcIter))
			*((*uintptr)(dstIter)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
			srcIter = unsafe.Add(srcIter, -BytesPerWord)
			dstIter = unsafe.Add(dstIter, -BytesPerWord)
		}
		// The first src word maps to the last dst word; this final write may
		// overlap the previous one when nByte isn't a multiple of BytesPerWord.
		srcFirstWordPtr := srcData
		dstLastWordPtr := unsafe.Add(dstData, finalOffset)
		srcWord := *((*uintptr)(srcFirstWordPtr))
		*((*uintptr)(dstLastWordPtr)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
		return
	}
	reverse8SSSE3Asm(dstData, srcData, nByte)
}

// Reverse8 sets dst[pos] := src[len(src) - 1 - pos] for every position in
// src. It panics if len(src) != len(dst).
func Reverse8(dst, src []byte) {
	nByte := len(src)
	if nByte != len(dst) {
		panic("Reverse8() requires len(src) == len(dst).")
	}
	if nByte < BytesPerWord {
		// could use bswap32 on two uint32s if nByte in 4..7
		nByteMinus1 := nByte - 1
		for idx := 0; idx != nByte; idx++ {
			dst[nByteMinus1-idx] = src[idx]
		}
		return
	}
	srcData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&src)).Data)
	dstData := unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&dst)).Data)
	if nByte < 16 {
		// use bswap64 on a word at a time
		nWordMinus1 := (nByte - 1) >> Log2BytesPerWord
		finalOffset := uintptr(nByte) - BytesPerWord
		srcIter := unsafe.Add(srcData, finalOffset)
		dstIter := dstData
		for widx := 0; widx < nWordMinus1; widx++ {
			srcWord := *((*uintptr)(srcIter))
			*((*uintptr)(dstIter)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
			srcIter = unsafe.Add(srcIter, -BytesPerWord)
			dstIter = unsafe.Add(dstIter, -BytesPerWord)
		}
		srcFirstWordPtr := srcData
		dstLastWordPtr := unsafe.Add(dstData, finalOffset)
		srcWord := *((*uintptr)(srcFirstWordPtr))
		*((*uintptr)(dstLastWordPtr)) = uintptr(bits.ReverseBytes64(uint64(srcWord)))
		return
	}
	reverse8SSSE3Asm(dstData, srcData, nByte)
}

// BitFromEveryByte fills dst[] with a bitarray containing every 8th bit from
// src[], starting with bitIdx, where bitIdx is in [0,7]. If len(src) is not
// divisible by 8, extra bits in the last filled byte of dst are set to zero.
//
// For example, if src[] is
//   0x1f 0x33 0x0d 0x00 0x51 0xcc 0x34 0x59 0x44
// and bitIdx is 2, bit 2 from every byte is
//   1 0 1 0 0 1 1 0 1
// so dst[] is filled with
//   0x65 0x01.
//
// - It panics if len(dst) < (len(src) + 7) / 8, or bitIdx isn't in [0,7].
//
// - If dst is larger than necessary, the extra bytes are not changed.
func BitFromEveryByte(dst, src []byte, bitIdx int) {
	requiredDstLen := (len(src) + 7) >> 3
	if (len(dst) < requiredDstLen) || (uint(bitIdx) > 7) {
		panic("BitFromEveryByte requires len(dst) >= (len(src) + 7) / 8 and 0 <= bitIdx < 8.")
	}
	nSrcVecByte := len(src) &^ (bytesPerVec - 1)
	if nSrcVecByte != 0 {
		bitFromEveryByteSSE2Asm(unsafe.Pointer(&dst[0]), unsafe.Pointer(&src[0]), 7-bitIdx, nSrcVecByte>>3)
	}
	remainder := len(src) - nSrcVecByte
	if remainder != 0 {
		// Not optimized since it isn't expected to matter.
		srcLast := src[nSrcVecByte:]
		dstLast := dst[nSrcVecByte>>3 : requiredDstLen]
		for i := range dstLast {
			dstLast[i] = 0
		}
		for i, b := range srcLast {
			dstLast[i>>3] |= ((b >> uint32(bitIdx)) & 1) << uint32(i&7)
		}
	}
}
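
// A sketch reproducing the worked example in the BitFromEveryByte doc comment
// above (illustrative only):
//
//	src := []byte{0x1f, 0x33, 0x0d, 0x00, 0x51, 0xcc, 0x34, 0x59, 0x44}
//	dst := make([]byte, (len(src)+7)/8)
//	simd.BitFromEveryByte(dst, src, 2)
//	// dst is now []byte{0x65, 0x01}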