github.com/grailbio/base@v0.0.11/simd/add_amd64.s (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!appengine
     6  
     7  TEXT ·addConst8TinyInplaceSSSE3Asm(SB),4,$0-16
     8          // Adds val to each of the 16 bytes at main, in place (PADDB wraps mod 256).
     9          // There is no length argument: exactly one 16-byte vector is read and written.
    10          MOVD    val+8(FP), X1
    11          MOVQ    main+0(FP), AX          // AX = main
    12          PXOR    X2, X2                  // X2 = all-zero shuffle mask
    13          PSHUFB  X2, X1                  // every byte of X1 = low byte of val
    14  
    15          MOVOU   (AX), X0                // unaligned 16-byte load
    16          PADDB   X1, X0                  // X0[i] = main[i] + val
    17          MOVOU   X0, (AX)                // write the sums back over main
    18  
    19          RET
    20  
    21  TEXT ·addConst8OddInplaceSSSE3Asm(SB),4,$0-24
    22          // Adds val to main[0:nByte] in place (PADDB wraps mod 256); nByte need not
    23          // be a multiple of 16.  NOTE(review): every access is a full 16-byte
    24          // vector, so this presumably requires nByte >= 16 -- confirm in the caller.
    25          MOVQ    nByte+16(FP), CX        // CX = nByte
    26          MOVQ    main+0(FP), DX          // DX = pointer to current main[] element
    27          MOVD    val+8(FP), X3
    28          PXOR    X4, X4                  // X4 = all-zero shuffle mask
    29          PSHUFB  X4, X3                  // every byte of X3 = low byte of val
    30  
    31          LEAQ    -32(DX)(CX*1), R9       // R9 = main + nByte - 32
    32          CMPQ    DX, R9
    33          JGE     addConst8OddInplaceSSSE3Tail    // nByte <= 32: skip the loop
    34  
    35  addConst8OddInplaceSSSE3Body:
    36          // One vector per pass.  A 2x unroll was tried; the benefit appears to
    37          // exist but is smaller than ~4%, so it was not adopted.
    38          MOVOU   (DX), X0
    39          PADDB   X3, X0
    40          MOVOU   X0, (DX)
    41          ADDQ    $16, DX
    42          CMPQ    DX, R9
    43          JLT     addConst8OddInplaceSSSE3Body    // repeat while DX < R9
    44  
    45  addConst8OddInplaceSSSE3Tail:
    46          // Last two vectors: the one at DX and the one ending at main + nByte,
    47          // i.e. at 16(R9).  They usually overlap, so both are loaded before either
    48          // is stored; otherwise the shared bytes would have val added twice.
    49          MOVOU   16(R9), X1
    50          MOVOU   (DX), X0
    51          PADDB   X3, X1
    52          PADDB   X3, X0
    53          MOVOU   X0, (DX)
    54          MOVOU   X1, 16(R9)
    55          RET
    56  
    57  TEXT ·addConst8SSSE3Asm(SB),4,$0-32
    58          // Writes src[i] + val to dst[i] (PADDB wraps mod 256), 16 bytes per pass.
    59          // The first pass runs before nByte is tested, and every pass moves a
    60          // whole vector, so a nByte that is not a multiple of 16 is overrun.
    61          MOVQ    nByte+24(FP), CX        // CX = nByte
    62          MOVQ    src+8(FP), SI           // SI = pointer to current src[] element
    63          MOVQ    dst+0(FP), DI           // DI = pointer to current dst[] element
    64          MOVD    val+16(FP), X2
    65          PXOR    X3, X3                  // X3 = all-zero shuffle mask
    66          PSHUFB  X3, X2                  // every byte of X2 = low byte of val
    67  
    68          // BX = pointer to end of src[].
    69          LEAQ    0(SI)(CX*1), BX
    70  
    71  addConst8SSSE3NextVec:
    72          MOVOU   (SI), X0
    73          ADDQ    $16, SI
    74          PADDB   X2, X0
    75          MOVOU   X0, (DI)
    76          ADDQ    $16, DI
    77          CMPQ    SI, BX
    78          JLT     addConst8SSSE3NextVec   // repeat while SI < end of src[]
    79  
    80          // NOTE(review): presumably callers pass a positive multiple of 16 -- confirm.
    81          RET
    82  
    83  TEXT ·addConst8OddSSSE3Asm(SB),4,$0-32
    84          // Writes src[i] + val to dst[i] (PADDB wraps mod 256) when nByte need not
    85          // be a multiple of 16: 16-byte passes first, then one vector ending at nByte.
    86          MOVQ    nByte+24(FP), CX        // CX = nByte
    87          MOVQ    src+8(FP), SI           // SI = pointer to current src[] element
    88          MOVQ    dst+0(FP), DI           // DI = pointer to current dst[] element
    89          MOVD    val+16(FP), X2
    90          PXOR    X3, X3                  // X3 = all-zero shuffle mask
    91          PSHUFB  X3, X2                  // every byte of X2 = low byte of val
    92  
    93          // R10 = 16 bytes before end of src[].
    94          // R11 = 16 bytes before end of dst[].
    95          LEAQ    -16(SI)(CX*1), R10
    96          LEAQ    -16(DI)(CX*1), R11
    97  
    98  addConst8OddSSSE3NextVec:
    99          MOVOU   (SI), X0
   100          ADDQ    $16, SI
   101          PADDB   X2, X0
   102          MOVOU   X0, (DI)
   103          ADDQ    $16, DI
   104          CMPQ    SI, R10
   105          JLT     addConst8OddSSSE3NextVec        // repeat while SI < R10
   106  
   107          // Final usually-unaligned read and write.  It is computed from src[], so
   108          // (assuming dst does not alias src) overlapped bytes get the same values.
   109          MOVOU   (R10), X0
   110          PADDB   X2, X0
   111          MOVOU   X0, (R11)
   112  
   113          RET
   114  
   115  TEXT ·subtractFromConst8TinyInplaceSSSE3Asm(SB),4,$0-16
   116          // Replaces each of the 16 bytes at main with val - main[i] (PSUBB wraps
   117          // mod 256).  Almost identical to addConst8TinyInplaceSSSE3Asm; there is no
   118          // length argument, so exactly one 16-byte vector is read and written.
   119          MOVD    val+8(FP), X1
   120          MOVQ    main+0(FP), AX          // AX = main
   121          PXOR    X2, X2                  // X2 = all-zero shuffle mask
   122          PSHUFB  X2, X1                  // every byte of X1 = low byte of val
   123  
   124          MOVOU   (AX), X0                // X0 = the 16 bytes at main
   125          PSUBB   X0, X1                  // X1[i] = val - main[i]
   126          MOVOU   X1, (AX)                // the constant register holds the result
   127  
   128          RET
   129  
   130  TEXT ·subtractFromConst8OddInplaceSSSE3Asm(SB),4,$0-24
   131          // Replaces main[i] with val - main[i] for i in [0, nByte), in place (PSUBB
   132          // wraps mod 256); nByte need not be a multiple of 16.  Almost identical to
   133          // addConst8OddInplaceSSSE3Asm.  NOTE(review): presumably needs nByte >= 16.
   134          MOVQ    nByte+16(FP), CX        // CX = nByte
   135          MOVQ    main+0(FP), DX          // DX = pointer to current main[] element
   136          MOVD    val+8(FP), X3
   137          PXOR    X4, X4                  // X4 = all-zero shuffle mask
   138          PSHUFB  X4, X3                  // every byte of X3 = low byte of val
   139  
   140          LEAQ    -32(DX)(CX*1), R9       // R9 = main + nByte - 32
   141          CMPQ    DX, R9
   142          JGE     subtractFromConst8OddInplaceSSSE3Tail   // nByte <= 32: skip the loop
   143  
   144  subtractFromConst8OddInplaceSSSE3Body:
   145          MOVO    X3, X0                  // PSUBB overwrites its destination: copy val
   146          MOVOU   (DX), X1
   147          PSUBB   X1, X0                  // X0[i] = val - main[i]
   148          MOVOU   X0, (DX)
   149          ADDQ    $16, DX
   150          CMPQ    DX, R9
   151          JLT     subtractFromConst8OddInplaceSSSE3Body   // repeat while DX < R9
   152  
   153  subtractFromConst8OddInplaceSSSE3Tail:
   154          // Last two vectors, at DX and at 16(R9) = main + nByte - 16.  Both are
   155          // loaded before either store; X3 itself is consumed as a result register.
   156          MOVOU   16(R9), X1
   157          MOVOU   (DX), X0
   158          MOVO    X3, X5
   159          PSUBB   X0, X3                  // X3 = val - vector at DX
   160          PSUBB   X1, X5                  // X5 = val - vector at 16(R9)
   161          MOVOU   X3, (DX)
   162          MOVOU   X5, 16(R9)
   163          RET
   164  
   165  TEXT ·subtractFromConst8SSSE3Asm(SB),4,$0-32
   166          // Writes val - src[i] to dst[i] (PSUBB wraps mod 256), 16 bytes per pass.
   167          // Almost identical to addConst8SSSE3Asm: the first pass runs before nByte
   168          // is tested, and every pass moves a whole 16-byte vector.
   169          MOVQ    nByte+24(FP), CX        // CX = nByte
   170          MOVQ    src+8(FP), SI           // SI = pointer to current src[] element
   171          MOVQ    dst+0(FP), DI           // DI = pointer to current dst[] element
   172          MOVD    val+16(FP), X2
   173          PXOR    X3, X3                  // X3 = all-zero shuffle mask
   174          PSHUFB  X3, X2                  // every byte of X2 = low byte of val
   175  
   176          // BX = pointer to end of src[].
   177          LEAQ    0(SI)(CX*1), BX
   178  
   179  subtractFromConst8SSSE3NextVec:
   180          MOVO    X2, X0                  // PSUBB overwrites its destination: copy val
   181          MOVOU   (SI), X1
   182          ADDQ    $16, SI
   183          PSUBB   X1, X0                  // X0[i] = val - src[i]
   184          MOVOU   X0, (DI)
   185          ADDQ    $16, DI
   186          CMPQ    SI, BX
   187          JLT     subtractFromConst8SSSE3NextVec  // repeat while SI < end of src[]
   188  
   189          // A nByte that is not a multiple of 16 is overrun by the last pass.
   190          // NOTE(review): presumably callers pass a positive multiple of 16 -- confirm.
   191          RET
   192  
   193  TEXT ·subtractFromConst8OddSSSE3Asm(SB),4,$0-32
   194          // Writes val - src[i] to dst[i] (PSUBB wraps mod 256) when nByte need not
   195          // be a multiple of 16: 16-byte passes first, then one vector ending at nByte.
   196          // Almost identical to addConst8OddSSSE3Asm.
   197          MOVQ    nByte+24(FP), CX        // CX = nByte
   198          MOVQ    src+8(FP), SI           // SI = pointer to current src[] element
   199          MOVQ    dst+0(FP), DI           // DI = pointer to current dst[] element
   200          MOVD    val+16(FP), X2
   201          PXOR    X3, X3                  // X3 = all-zero shuffle mask
   202          PSHUFB  X3, X2                  // every byte of X2 = low byte of val
   203  
   204          // R10 = 16 bytes before end of src[].
   205          // R11 = 16 bytes before end of dst[].
   206          LEAQ    -16(SI)(CX*1), R10
   207          LEAQ    -16(DI)(CX*1), R11
   208  
   209  subtractFromConst8OddSSSE3NextVec:
   210          MOVO    X2, X0                  // PSUBB overwrites its destination: copy val
   211          MOVOU   (SI), X1
   212          ADDQ    $16, SI
   213          PSUBB   X1, X0                  // X0[i] = val - src[i]
   214          MOVOU   X0, (DI)
   215          ADDQ    $16, DI
   216          CMPQ    SI, R10
   217          JLT     subtractFromConst8OddSSSE3NextVec       // repeat while SI < R10
   218  
   219          // Final usually-unaligned read and write.  X2 is used as the destination
   220          // directly because the constant is not needed after this point.
   221          MOVOU   (R10), X1
   222          PSUBB   X1, X2
   223          MOVOU   X2, (R11)
   224  
   225          RET