github.com/grailbio/base@v0.0.11/simd/cmp_amd64.s (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!appengine
     6  
     7  TEXT ·firstGreater8SSSE3Asm(SB),4,$0-40
                // Scans the bytes from arg+startPos up to (not including) arg+endPos,
                // 16 at a time, and stores in ret the offset (relative to arg) of the
                // first byte that is greater than val, compared as unsigned bytes.
                // Stores endPos in ret if no such byte is found.
                //
                // Frame $0-40: no locals; 40 bytes of arguments plus result:
                //   arg+0(FP)        base address of the byte buffer
                //   val+8(FP)        threshold value
                //   startPos+16(FP)  offset of the first byte to examine
                //   endPos+24(FP)    offset one past the last byte to examine
                //   ret+32(FP)       result offset
                // The literal flag value 4 is NOSPLIT in Go's textflag.h.
                //
                // Register roles:
                //   DI = arg, R9 = endPos
                //   AX = address of the vector currently being examined
                //   R8 = arg+endPos-16, the address of the last 16-byte vector
                //   X0 = bias byte broadcast to all 16 lanes
                //   X1 = all-zero shuffle mask during setup, then the loaded data
                //   BX = val during setup, then the PMOVMSKB bit mask
                //   DX = bit index of the first hit (Found path only)
                //
                // NOTE(review): the "greater than val" result relies on
                // 0 <= val <= 255; a negative val would take the low-value path
                // because the compare below is signed. Presumably the Go caller
                // passes a byte widened to int -- confirm.
                // NOTE(review): the final step always loads the 16 bytes at
                // arg+endPos-16, so this assumes endPos - startPos >= 16; with a
                // shorter range it would examine bytes before startPos. The caller
                // is not visible here -- confirm it enforces this.
     8          MOVQ    arg+0(FP), DI
     9          MOVQ	val+8(FP), BX
    10          MOVQ    startPos+16(FP), AX
    11          MOVQ    endPos+24(FP), R9
    12  
    13          ADDQ    DI, AX
    14          // AX is now &(arg[startPos])
    15          PXOR    X1, X1
                // X1 is all zero; it is used below as the PSHUFB control mask.
    16          LEAQ    -16(DI)(R9*1), R8
    17          // R8 is now &(arg[endPos - 16])
    18  
    19          // We now distinguish two cases.
    20          // 1. val <= 127.  Then we saturate-add (127 - val) to each byte before
    21          //    movemask.
    22          // 2. val > 127.  Then we saturate-subtract (val - 127) from each byte
    23          //    before movemask.
                // In both cases a byte ends up >= 128 (high bit set) exactly when it
                // was greater than val, and PMOVMSKB collects those high bits.
    24          CMPQ    BX, $127
    25          JG      firstGreater8SSSE3HighVal
    26  
                // For 0 <= val <= 127, (val XOR 127) equals (127 - val), because 127
                // has all of its low seven bits set.
    27          XORQ    $127, BX
    28          MOVD    BX, X0
                // With an all-zero control mask, PSHUFB copies byte 0 of X0 into
                // every lane.
    29          PSHUFB  X1, X0
    30          // all bytes of X0 are now equal to (127 - val)
                // Skip the loop when the first vector is already at or past the last
                // full vector (signed compare on the two addresses).
    31          CMPQ    R8, AX
    32          JLE     firstGreater8SSSE3LowValFinal
    33  
    34  firstGreater8SSSE3LowValLoop:
                // Unaligned 16-byte load; X1 is reused for data from here on.
    35          MOVOU   (AX), X1
    36          PADDUSB X0, X1
                // Bit i of BX is the high bit of byte i, i.e. set iff arg byte > val.
    37          PMOVMSKB        X1, BX
    38          TESTQ   BX, BX
    39          JNE     firstGreater8SSSE3Found
    40          ADDQ    $16, AX
                // Continue while AX is still below the last full vector.
    41          CMPQ    R8, AX
    42          JG      firstGreater8SSSE3LowValLoop
    43  
    44  firstGreater8SSSE3LowValFinal:
                // Examine the last 16 bytes of the range.  This vector may overlap
                // bytes the loop already checked; those produced no hit, so the
                // lowest set mask bit still identifies the first hit overall.
                // AX is set to R8 so the Found path computes the offset from it.
    45          MOVQ    R8, AX
    46          MOVOU   (R8), X1
    47          PADDUSB X0, X1
    48          PMOVMSKB        X1, BX
    49          TESTQ   BX, BX
    50          JNE     firstGreater8SSSE3Found
                // No byte greater than val: return endPos.
    51          MOVQ    R9, ret+32(FP)
    52          RET
    53  
    54  firstGreater8SSSE3Found:
                // Shared by both paths.  BX is the nonzero hit mask for the vector
                // at address AX.  Result = (AX - arg) + index of the lowest set bit.
    55          BSFQ    BX, DX
    56          SUBQ    DI, AX
    57          ADDQ    DX, AX
    58          MOVQ    AX, ret+32(FP)
    59          RET
    60  
    61  firstGreater8SSSE3HighVal:
                // val > 127: the bias is (val - 127), subtracted with unsigned
                // saturation, so only bytes greater than val stay at 128 or above.
    62          SUBQ    $127, BX
    63          MOVD    BX, X0
                // X1 is still all zero here, so this broadcasts byte 0 of X0.
    64          PSHUFB  X1, X0
    65          // all bytes of X0 are now equal to (val - 127)
    66          CMPQ    R8, AX
    67          JLE     firstGreater8SSSE3HighValFinal
    68  
    69  firstGreater8SSSE3HighValLoop:
                // Same structure as the low-value loop, with PSUBUSB in place of
                // PADDUSB.
    70          MOVOU   (AX), X1
    71          PSUBUSB X0, X1
    72          PMOVMSKB        X1, BX
    73          TESTQ   BX, BX
    74          JNE     firstGreater8SSSE3Found
    75          ADDQ    $16, AX
    76          CMPQ    R8, AX
    77          JG      firstGreater8SSSE3HighValLoop
    78  
    79  firstGreater8SSSE3HighValFinal:
                // Last 16 bytes of the range (may overlap already-checked bytes).
    80          MOVQ    R8, AX
    81          MOVOU   (R8), X1
    82          PSUBUSB X0, X1
    83          PMOVMSKB        X1, BX
    84          TESTQ   BX, BX
    85          JNE     firstGreater8SSSE3Found
                // No byte greater than val: return endPos.
    86          MOVQ    R9, ret+32(FP)
    87          RET
    88  
    89  
    90  TEXT ·firstLeq8SSSE3Asm(SB),4,$0-40
                // Scans the bytes from arg+startPos up to (not including) arg+endPos,
                // 16 at a time, and stores in ret the offset (relative to arg) of the
                // first byte that is less than or equal to the low byte of val,
                // compared as unsigned bytes.  Stores endPos in ret if no such byte
                // is found.
                //
                // Frame $0-40: no locals; 40 bytes of arguments plus result:
                //   arg+0(FP)        base address of the byte buffer
                //   val+8(FP)        threshold value (only its low byte is used)
                //   startPos+16(FP)  offset of the first byte to examine
                //   endPos+24(FP)    offset one past the last byte to examine
                //   ret+32(FP)       result offset
                // The literal flag value 4 is NOSPLIT in Go's textflag.h.
                //
                // Register roles:
                //   DI = arg, R9 = endPos
                //   AX = address of the vector currently being examined
                //   R8 = arg+endPos-16, the address of the last 16-byte vector
                //   X0 = val broadcast to all 16 lanes
                //   X1 = all-zero vector (shuffle mask and compare operand)
                //   X2 = loaded data / per-byte result
                //   BX = PMOVMSKB bit mask, DX = bit index of the first hit
                //
                // NOTE(review): the final step always loads the 16 bytes at
                // arg+endPos-16, so this assumes endPos - startPos >= 16; with a
                // shorter range it would examine bytes before startPos. The caller
                // is not visible here -- confirm it enforces this.
    91          MOVQ    arg+0(FP), DI
    92          MOVD	val+8(FP), X0
    93          MOVQ    startPos+16(FP), AX
    94          MOVQ    endPos+24(FP), R9
    95  
    96          ADDQ    DI, AX
    97          // AX is now &(arg[startPos])
    98          PXOR    X1, X1
    99          // X1 is a fixed all-zero vector
   100          LEAQ    -16(DI)(R9*1), R8
   101          // R8 is now &(arg[endPos - 16])
                // With an all-zero control mask, PSHUFB copies byte 0 of X0 into
                // every lane.
   102          PSHUFB  X1, X0
   103          // all bytes of X0 are now equal to val
                // Skip the loop when the first vector is already at or past the last
                // full vector (signed compare on the two addresses).
   104          CMPQ    R8, AX
   105          JLE     firstLeq8SSSE3Final
   106  
   107  firstLeq8SSSE3Loop:
                // Unaligned 16-byte load, then unsigned saturating subtract of val.
   108          MOVOU   (AX), X2
   109          PSUBUSB X0, X2
   110          // X2 is 0 for all bytes originally <= val, and nonzero otherwise.
   111          PCMPEQB X1, X2
   112          // X2 is now 255 for all bytes originally <= val, and 0 otherwise.
                // Bit i of BX is set iff byte i of this vector is <= val.
   113          PMOVMSKB        X2, BX
   114          TESTQ   BX, BX
   115          JNE     firstLeq8SSSE3Found
   116          ADDQ    $16, AX
                // Continue while AX is still below the last full vector.
   117          CMPQ    R8, AX
   118          JG      firstLeq8SSSE3Loop
   119  
   120  firstLeq8SSSE3Final:
                // Examine the last 16 bytes of the range.  This vector may overlap
                // bytes the loop already checked; those produced no hit, so the
                // lowest set mask bit still identifies the first hit overall.
                // AX is set to R8 so the Found path computes the offset from it.
   121          MOVQ    R8, AX
   122          MOVOU   (R8), X2
   123          PSUBUSB X0, X2
   124          PCMPEQB X1, X2
   125          PMOVMSKB        X2, BX
   126          TESTQ   BX, BX
   127          JNE     firstLeq8SSSE3Found
                // No byte <= val: return endPos.
   128          MOVQ    R9, ret+32(FP)
   129          RET
   130  
   131  firstLeq8SSSE3Found:
                // BX is the nonzero hit mask for the vector at address AX.
                // Result = (AX - arg) + index of the lowest set bit.
   132          BSFQ    BX, DX
   133          SUBQ    DI, AX
   134          ADDQ    DX, AX
   135          MOVQ    AX, ret+32(FP)
   136          RET