github.com/grailbio/base@v0.0.11/simd/multibyte_amd64.s

github.com/grailbio/base@v0.0.11/simd/multibyte_amd64.s (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!appengine
     6  
     7          DATA ·Reverse16<>+0x00(SB)/8, $0x09080b0a0d0c0f0e       // mask bytes 0-7 in memory order: 0e 0f 0c 0d 0a 0b 08 09
     8          DATA ·Reverse16<>+0x08(SB)/8, $0x0100030205040706       // mask bytes 8-15 in memory order: 06 07 04 05 02 03 00 01
     9          GLOBL ·Reverse16<>(SB), 24, $16
    10          // NOPTR = 16, RODATA = 8, so flags 24 = NOPTR|RODATA.  Used as a PSHUFB mask, this table reverses the order of the eight 2-byte lanes of a vector while keeping the two bytes inside each lane in place.
    11  
    12  TEXT ·index16SSE2Asm(SB),4,$0-32        // flag 4 = NOSPLIT; no local frame, 32 bytes of args + result
    13          // index16SSE2Asm scans main[], searching for the first instance of
    14          // val.  If no instances are found, it returns -1; otherwise it returns the element index of the first match.
    15          // It requires nElem >= 8 (the first 16-byte load below is unconditional).
    16          // The implementation is based on a loop which uses _mm_cmpeq_epi16()
    17          // to scan 8 uint16s in parallel, and _mm_movemask_epi8() to extract
    18          // the result of that scan.  It is similar to firstLeq8 in cmp_amd64.s.
    19  
    20          // There's a ~10% benefit from 2x-unrolling the main loop so that only
    21          // one test is performed per loop iteration (i.e. just look at the
    22          // bitwise-or of the comparison results, and backtrack a bit on a hit).
    23          // I'll leave that on the table for now to keep the logic simpler.
    24          // Arguments: main+0(FP), val+8(FP), nElem+16(FP); result: ret+24(FP).
    25          // Register allocation (X1 and DX are per-window scratch):
    26          //   AX: pointer to start of main[]
    27          //   BX: nElem - 8
    28          //   CX: current index
    29          //   X0: vector with 8 copies of val
    30          MOVQ    main+0(FP), AX
    31  
    32          // clang compiles _mm_set1_epi16() to this, I'll trust it.
    33          MOVQ    val+8(FP), X0   // word 0 of X0 = low 16 bits of val
    34          PSHUFLW $0xe0, X0, X0   // copy word 0 into word 1 (words 2 and 3 unchanged)
    35          PSHUFD  $0, X0, X0      // broadcast dword 0 to all four dwords
    36  
    37          MOVQ    nElem+16(FP), BX
    38          SUBQ    $8, BX          // BX = start index of the final 8-element window
    39          XORL    CX, CX          // CX = 0
    40  
    41  index16SSE2AsmLoop:
    42          // Scan 8 elements starting from &(main[CX]).
    43          MOVOU   (AX)(CX*2), X1
    44          PCMPEQW X0, X1          // each 16-bit lane of X1 becomes all-ones on a match, zero otherwise
    45          PMOVMSKB        X1, DX
    46          // Bits 2k and 2k+1 are now set in DX iff the uint16 at position k
    47          // compared equal.
    48          TESTQ   DX, DX
    49          JNE     index16SSE2AsmFound
    50          ADDQ    $8, CX
    51          CMPQ    BX, CX
    52          JG      index16SSE2AsmLoop      // keep going while nElem - 8 > CX (signed compare)
    53  
    54          // Scan the last 8 elements; this may partially overlap with the
    55          // previous scan.
    56          MOVQ    BX, CX          // CX = nElem - 8, so the Found path adds the right base index
    57          MOVOU   (AX)(CX*2), X1
    58          PCMPEQW X0, X1
    59          PMOVMSKB        X1, DX
    60          TESTQ   DX, DX
    61          JNE     index16SSE2AsmFound
    62          // No match found, return -1.
    63          MOVQ    $-1, ret+24(FP)
    64          RET
    65  
    66  index16SSE2AsmFound:
    67          BSFQ    DX, AX
    68          // AX now has the index of the lowest set bit in DX.
    69          SHRQ    $1, AX          // two mask bits per uint16, so bit index / 2 = position within the window
    70          ADDQ    CX, AX          // add the window's starting index
    71          MOVQ    AX, ret+24(FP)
    72          RET
    73  
    74  TEXT ·reverse16InplaceSSSE3Asm(SB),4,$0-16      // reverses the nElem 2-byte elements of main[] in place; flag 4 = NOSPLIT, 16 bytes of args
    75          // This is only called with nElem > 8.  So we can safely divide this
    76          // into two cases:
    77          // 1. (nElem+7) % 16 in {0..7}.  Execute (nElem+7)/16 normal iterations
    78          //    and exit.  Last two writes usually overlap.
    79          // 2. (nElem+7) % 16 in {8..15}.  Execute (nElem-9)/16 normal
    80          //    iterations.  Then we have between 17 and 24 central elements
    81          //    left; handle them by processing *three* vectors at once at the
    82          //    end.
    83          // Logic is essentially identical to reverseComp4InplaceSSSE3Asm,
    84          // except we don't need to complement here.
    85          MOVQ    main+0(FP), SI
    86          MOVQ    nElem+8(FP), AX
    87  
    88          // DI iterates backwards from the end of main[].
    89          LEAQ    -16(SI)(AX*2), DI       // DI = address of the last 8 elements (16 bytes)
    90  
    91          MOVOU   ·Reverse16<>(SB), X0    // X0 = PSHUFB mask that reverses 8 two-byte lanes
    92          SUBQ    $1, AX
    93          MOVQ    AX, BX
    94          ANDQ    $8, BX                  // BX = (nElem - 1) & 8
    95          // BX is now 8 when we don't need to process 3 vectors at the end
    96          // (case 1 above), and 0 when we do (case 2).
    97          LEAQ    0(AX)(BX*2), R9
    98          // R9 is now nElem+15 when we don't need to process 3 vectors at the
    99          // end, and nElem-1 when we do.
   100          LEAQ    -24(SI)(R9*1), AX
   101          // AX can now be used for the loop termination check:
   102          //   if nElem == 9, R9 == 24, so AX == uintptr(main) + 0.
   103          //   if nElem == 16, R9 == 31, so AX == uintptr(main) + 7.
   104          //   if nElem == 17, R9 == 16, so AX == uintptr(main) - 8.
   105          //   if nElem == 24, R9 == 23, so AX == uintptr(main) - 1.
   106          CMPQ    AX, SI
   107          JL      reverse16InplaceSSSE3LastThree  // AX < main only when 17 <= nElem <= 24: no normal iterations at all
   108  
   109  reverse16InplaceSSSE3Loop:
   110          MOVOU   (SI), X1        // front 8 elements
   111          MOVOU   (DI), X2        // back 8 elements; both loads precede the stores because the two windows may overlap
   112          PSHUFB  X0, X1
   113          PSHUFB  X0, X2
   114          MOVOU   X2, (SI)        // reversed back window goes to the front
   115          MOVOU   X1, (DI)        // reversed front window goes to the back
   116          ADDQ    $16, SI
   117          SUBQ    $16, DI
   118          CMPQ    AX, SI          // SI advances 16 bytes per pass and each pass retires 16 elements, so SI - main counts retired elements
   119          JGE     reverse16InplaceSSSE3Loop
   120  
   121          TESTQ   BX, BX
   122          JNE     reverse16InplaceSSSE3Ret        // BX == 8: case 1, nothing left in the middle
   123  reverse16InplaceSSSE3LastThree:
   124          MOVOU   (SI), X1        // first 8 of the remaining 17..24 central elements
   125          MOVOU   16(SI), X2      // next 8
   126          MOVOU   (DI), X3        // last 8 (overlaps X2's range unless exactly 24 remain)
   127          PSHUFB  X0, X1
   128          PSHUFB  X0, X2
   129          PSHUFB  X0, X3
   130          MOVOU   X3, (SI)        // all three loads are complete, so overlapping stores are safe
   131          MOVOU   X2, -16(DI)     // mirror position of the 16(SI) window
   132          MOVOU   X1, (DI)
   133  
   134  reverse16InplaceSSSE3Ret:
   135          RET
   136  
   137  TEXT ·reverse16SSSE3Asm(SB),4,$0-24     // writes the nElem 2-byte elements of src[] to dst[] in reverse order; flag 4 = NOSPLIT, 24 bytes of args
   138          MOVQ    dst+0(FP), DI
   139          MOVQ    src+8(FP), SI
   140          MOVQ    nElem+16(FP), AX
   141          // NOTE(review): every access below is a full 16-byte vector and the first one is unconditional, so this appears to require nElem >= 8 -- confirm against the caller.
   142          // R8 iterates backwards from the end of src[].
   143          LEAQ    -16(SI)(AX*2), R8       // R8 = address of the last 8 src elements
   144          MOVOU   ·Reverse16<>(SB), X0    // X0 = PSHUFB mask that reverses 8 two-byte lanes
   145          // Save final dst[] pointer for later.
   146          LEAQ    -16(DI)(AX*2), R9       // R9 = address of the last 8 dst elements
   147  
   148  reverse16SSSE3Loop:
   149          MOVOU   (R8), X1
   150          PSHUFB  X0, X1
   151          MOVOU   X1, (DI)        // reversed 8-element window from the back of src goes to the front of dst
   152          SUBQ    $16, R8
   153          ADDQ    $16, DI
   154          CMPQ    SI, R8
   155          JL      reverse16SSSE3Loop      // continue while R8 is still above the start of src (signed compare)
   156          // Tail: the first 8 src elements go to the last 8 dst slots; this store may overlap the loop's final store.
   157          MOVOU   (SI), X1
   158          PSHUFB  X0, X1
   159          MOVOU   X1, (R9)
   160          RET