github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/count_amd64.s

github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/simd/count_amd64.s (about)

     1  // Copyright 2018 GRAIL, Inc.  All rights reserved.
     2  // Use of this source code is governed by the Apache-2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build amd64,!appengine
     6  
     7          DATA ·Mask0f0f<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f
     8          DATA ·Mask0f0f<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f
     9          GLOBL ·Mask0f0f<>(SB), 24, $16
    10          // NOPTR = 16, RODATA = 8
                // (so the flag value 24 used in the GLOBL lines is NOPTR|RODATA).
                // Mask0f0f is a 16-byte constant in which every byte is 0x0f.  The
                // packed-nibble counting routines in this file AND it with shifted
                // and unshifted source bytes to isolate the two 4-bit values.
    11  
                // LeadingByteMask is a 32-byte constant: 16 bytes of 0x00 followed by
                // 16 bytes of 0xff.  An unaligned 16-byte load from offset (16 - k),
                // for 0 <= k <= 16, yields a vector whose first k bytes are 0x00 and
                // whose remaining bytes are 0xff.  The routines in this file use such
                // a load to discard the first k bytes of a final vector that overlaps
                // data which has already been processed.
    12          DATA LeadingByteMask<>+0x00(SB)/8, $0x0000000000000000
    13          DATA LeadingByteMask<>+0x08(SB)/8, $0x0000000000000000
    14          DATA LeadingByteMask<>+0x10(SB)/8, $0xffffffffffffffff
    15          DATA LeadingByteMask<>+0x18(SB)/8, $0xffffffffffffffff
    16          GLOBL LeadingByteMask<>(SB), 24, $32
    17  
    18  // Original code had LOOP, which was substantially slower than JNE.  2x
    19  // unrolling of the loop also makes a big (~35% with single thread) difference,
    20  // probably because of POPCNTQ's 3-cycle latency.
    21  // (4x benchmarks even better than 2x on both my test machines, but only by
    22  // ~5%, so I'll skip it and spend any remaining implementation effort on Mula's
    23  // faster AVX2 algorithm.)
        //
        // popcntWordArraySSE42Asm adds up the POPCNTQ results of nWord consecutive
        // 8-byte words starting at bytes, and stores the total in ret (0 when
        // nWord == 0).
        // Stack arguments: bytes+0(FP), nWord+8(FP); result: ret+16(FP).
        // Register roles: AX = running total, BX = nWord (later nWord & 1),
        // SI = pointer to the current word, DI = pointer to the last word,
        // DX and R8 = per-word popcounts inside the unrolled loop.
        // The 4 in the TEXT line is the NOSPLIT flag value from Go's textflag.h.
    24  TEXT ·popcntWordArraySSE42Asm(SB),4,$0-24
    25          MOVQ	nWord+8(FP), BX
    26          XORQ	AX, AX
    27          TESTQ   BX, BX
    28          // Length == 0?  Return immediately.
    29          JE      popcntWordArraySSE42Finish
    30  
    31          // SI = pointer to current element of array.
    32          MOVQ	bytes+0(FP), SI
    33          // DI = pointer to last element of array.
    34          LEAQ    -8(SI)(BX*8), DI
                // ANDQ leaves the zero flag set exactly when nWord is even.
    35          ANDQ    $1, BX
    36          // Skip next block if array length is even.
    37          JE      popcntWordArraySSE42Loop
    38  
                // Odd length: count the first word on its own, so that an even
                // number of words remains for the 2x-unrolled loop.
    39          POPCNTQ (SI), AX
    40          CMPQ    DI, SI
    41          // If array length was exactly 1, return.
    42          JE      popcntWordArraySSE42Finish
    43          ADDQ    $8, SI
    44  
    45  popcntWordArraySSE42Loop:
    46          // Remaining word count must be even.  Process 2 words at a time.
    47          POPCNTQ (SI), DX
    48          ADDQ	DX, AX
    49          POPCNTQ 8(SI), R8
    50          ADDQ    R8, AX
    51          ADDQ	$16, SI
                // Keep going while the last word (DI) is still ahead of SI.
    52          CMPQ    DI, SI
    53          JG      popcntWordArraySSE42Loop
    54  
    55  popcntWordArraySSE42Finish:
    56          MOVQ	AX, ret+16(FP)
    57          RET
    58  
    59  TEXT ·maskThenCountByteSSE41Asm(SB),4,$0-40
                // Counts the bytes b in src[0:nByte] for which (b & mask) == val, and
                // stores the count in ret.
                // Stack arguments: src+0(FP), mask+8(FP), val+16(FP), nByte+24(FP);
                // result: ret+32(FP).  Only the low byte of mask and of val is used,
                // because the PSHUFB instructions below broadcast byte 0 of each.
                //
                // Register roles:
                //   DI = current read position, SI = start of the last 16-byte vector,
                //   BX = bytes remaining (only kept up to date on the Large path),
                //   AX = address of LeadingByteMask,
                //   X0 = mask broadcast, X1 = val broadcast, X6 = all-zero vector,
                //   X2 = inner (per-byte) accumulator,
                //   X7 = outer (2 x uint64) accumulator, X3/X4 = scratch.
                //
    60          // This assumes nByte >= 16.  We can revisit the tiny case if it's ever
    61          // a bottleneck, but I'd guess that the obvious Golang loop is close
    62          // enough to optimal there.
    63          MOVQ	src+0(FP), DI
    64          MOVD    mask+8(FP), X0
    65          MOVD    val+16(FP), X1
    66          MOVQ    nByte+24(FP), BX
    67  
    68          // Make X6 a permanent all-zero vector.
    69          PXOR    X6, X6
    70  
    71          // Save start of last 16-byte vector.  This is used for both
    72          // loop-comparison and as the final (usually-)unaligned-load address.
    73          LEAQ    -16(DI)(BX*1), SI
    74          // Make all bytes of X0 and X1 equal to mask and val, respectively.
                // (A PSHUFB whose index vector is all-zero copies byte 0 to every
                // lane.)
    75          PSHUFB  X6, X0
    76          PSHUFB  X6, X1
    77  
    78          MOVQ    $LeadingByteMask<>(SB), AX
    79          // X2 is the "inner accumulator".  Each of its bytes is a count from
    80          // 0-255 of how many masked bytes of src[] at the same position mod 16
    81          // are equal to val.
    82          // Up to 255*16=4080 bytes, this is enough.  Beyond that, the counts
    83          // might overflow, so we save the intermediate results to an "outer
    84          // accumulator" (X7), as a pair of uint64 counts.  Conveniently, the
    85          // PSADBW instruction directly converts the inner count representation
    86          // to the outer representation, though its latency is high enough that
    87          // it's best avoided in innermost loops.
    88          PXOR    X2, X2
    89          PXOR    X7, X7
    90          CMPQ    BX, $4080
    91          JG      maskThenCountByteSSE41Large
    92  
    93  maskThenCountByteSSE41Loop:
    94          // Load 16 bytes from src.
    95          MOVOU   (DI), X3
    96          // Apply mask.
    97          PAND    X0, X3
    98          // Parallel-compare.
    99          PCMPEQB X1, X3
   100          // X3 now has 255 on equality, and zero on inequality.
   101          // Bytewise-*subtract* it from the inner accumulator, to add 1 for each
   102          // match.
   103          PSUBB   X3, X2
   104          // Advance to next 16 bytes, check for loop end.
   105          ADDQ    $16, DI
   106          CMPQ    SI, DI
   107          JG      maskThenCountByteSSE41Loop
   108  
   109          // Load last 16 bytes from src.  This may overlap with the previous
   110          // main-loop load.
   111          MOVOU   (SI), X3
   112          PAND    X0, X3
   113          PCMPEQB X1, X3
   114          // We now want to mask out the first k bytes of X3, where k is the
   115          // number of overlapping bytes between the last two loads.
   116          //
   117          // DI is the end of the previous load, while SI is the position of the
   118          // current load.  So (SI-DI) is a number in [-16, 0] which is
   119          // -[# of overlapping bytes].
   120          // AX points to 'LeadingByteMask', which is a read-only array where the
   121          // first 16 bytes are 0, and the next 16 bytes are 255.
   122          // So, once SUBQ has replaced SI with SI-DI:
   123          // - if SI-DI=-16, LEAQ 16(AX)(SI*1) points at the 16 bytes starting from
   124          //   &(LeadingByteMask[16 + (-16)]).  These are all zero, which is the
   125          //   correct mask when all 16 bytes overlap.
   126          // - if SI-DI=-15, LEAQ 16(AX)(SI*1) points at the 16 bytes starting from
   127          //   &(LeadingByteMask[16 + (-15)]).  The first 15 bytes are zero, and
   128          //   the last byte is 255.  This is the correct mask when the first 15
   129          //   bytes overlap.
   130          // etc.
   131          SUBQ    DI, SI
   132          LEAQ    16(AX)(SI*1), DX
   133          MOVOU   (DX), X4
   134          PAND    X4, X3
   135          PSUBB   X3, X2
   136          // Now extract results.
                // PSADBW against zero sums each 8-byte half of X2 into a uint64; the
                // two halves of the outer accumulator are then added together in BX.
   137          PSADBW  X6, X2
   138          PADDQ   X2, X7
   139          MOVQ    X7, BX
   140          PEXTRQ  $1, X7, AX
   141          ADDQ    AX, BX
   142          MOVQ    BX, ret+32(FP)
   143          RET
   144  
   145  maskThenCountByteSSE41Large:
   146          MOVQ    DI, R8
   147          // Tried forcing 16-byte alignment, no noticeable performance impact on
   148          // my Mac.
   149          // Stop this loop after 254 rather than 255 iterations.  This way,
   150          // we guarantee there are at least 16 bytes left when we jump to
   151          // maskThenCountByteSSE41Loop.
   152          ADDQ    $4064, R8
   153  maskThenCountByteSSE41LargeLoop:
   154          MOVOU   (DI), X3
   155          PAND    X0, X3
   156          PCMPEQB X1, X3
   157          PSUBB   X3, X2
   158          ADDQ    $16, DI
   159          CMPQ    R8, DI
   160          JG      maskThenCountByteSSE41LargeLoop
   161  
                // Flush the inner accumulator into the outer one, reset it, and
                // account for the 4064 bytes just consumed.
   162          PSADBW  X6, X2
   163          PADDQ   X2, X7
   164          PXOR    X2, X2
   165          SUBQ    $4064, BX
   166          CMPQ    BX, $4080
   167          JG      maskThenCountByteSSE41Large
   168          JMP     maskThenCountByteSSE41Loop
   169  
   170  TEXT ·count2BytesSSE41Asm(SB),4,$0-40
                // Counts the bytes of src[0:nByte] that equal val1 or val2, and stores
                // the count in ret.
                // Stack arguments: src+0(FP), val1+8(FP), val2+16(FP), nByte+24(FP);
                // result: ret+32(FP).  Only the low byte of val1 and val2 is used.
                // The code loads 16 bytes at src and at src+nByte-16 unconditionally,
                // so it relies on nByte >= 16.
                //
                // Register roles: DI = current read position, SI = start of the last
                // 16-byte vector, BX = bytes remaining (Large path only),
                // X0/X1 = val1/val2 broadcast, X6 = zero, X2 = inner per-byte
                // accumulator, X7 = outer 2 x uint64 accumulator, X3/X4 = scratch.
   171          // This is almost identical to maskThenCountByteSSE41Asm.
   172          MOVQ	src+0(FP), DI
   173          MOVD    val1+8(FP), X0
   174          MOVD    val2+16(FP), X1
   175          MOVQ    nByte+24(FP), BX
   176  
   177          PXOR    X6, X6
   178          LEAQ    -16(DI)(BX*1), SI
                // Broadcast byte 0 of X0 and X1 to all 16 lanes.
   179          PSHUFB  X6, X0
   180          PSHUFB  X6, X1
   181  
   182          PXOR    X2, X2
   183          PXOR    X7, X7
                // Above 4080 bytes the per-byte counters in X2 could overflow, so
                // take the path that periodically flushes X2 into X7.
   184          CMPQ    BX, $4080
   185          JG      count2BytesSSE41Large
   186  
   187  count2BytesSSE41Loop:
                // X3 = (src == val1), X4 = (src == val2); a byte cannot match both
                // unless val1 == val2, and POR merges the two 0/255 masks either way.
   188          MOVOU   (DI), X3
   189          MOVO    X3, X4
   190          PCMPEQB X0, X3
   191          PCMPEQB X1, X4
   192          POR     X4, X3
                // Subtracting 255 (i.e. -1) adds 1 to the counter for each match.
   193          PSUBB   X3, X2
   194          ADDQ    $16, DI
   195          CMPQ    SI, DI
   196          JG      count2BytesSSE41Loop
   197  
                // Final, possibly overlapping, 16 bytes.  X0 and X1 are overwritten
                // with the comparison results here; they are not needed afterwards.
   198          MOVOU   (SI), X3
   199          PCMPEQB X3, X0
   200          PCMPEQB X3, X1
   201          POR     X1, X0
                // SI-DI is minus the number of bytes already counted by the main
                // loop; the LeadingByteMask load zeroes exactly those leading lanes.
   202          MOVQ    $LeadingByteMask<>(SB), AX
   203          SUBQ    DI, SI
   204          LEAQ    16(AX)(SI*1), DX
   205          MOVOU   (DX), X4
   206          PAND    X4, X0
   207          PSUBB   X0, X2
                // Reduce the per-byte counters to two uint64 sums and add them up.
   208          PSADBW  X6, X2
   209          PADDQ   X2, X7
   210          MOVQ    X7, BX
   211          PEXTRQ  $1, X7, AX
   212          ADDQ    AX, BX
   213          MOVQ    BX, ret+32(FP)
   214          RET
   215  
   216  count2BytesSSE41Large:
                // Process 4064 bytes (254 iterations) before flushing, so that at
                // least 16 bytes are left when control returns to the small loop.
   217          MOVQ    DI, R8
   218          ADDQ    $4064, R8
   219  count2BytesSSE41LargeLoop:
   220          MOVOU   (DI), X3
   221          MOVO    X3, X4
   222          PCMPEQB X0, X3
   223          PCMPEQB X1, X4
   224          POR     X4, X3
   225          PSUBB   X3, X2
   226          ADDQ    $16, DI
   227          CMPQ    R8, DI
   228          JG      count2BytesSSE41LargeLoop
   229  
                // Flush X2 into X7, clear it, and update the remaining byte count.
   230          PSADBW  X6, X2
   231          PADDQ   X2, X7
   232          PXOR    X2, X2
   233          SUBQ    $4064, BX
   234          CMPQ    BX, $4080
   235          JG      count2BytesSSE41Large
   236          JMP     count2BytesSSE41Loop
   237  
   238  TEXT ·count3BytesSSE41Asm(SB),4,$0-48
                // Counts the bytes of src[0:nByte] that equal val1, val2 or val3, and
                // stores the count in ret.
                // Stack arguments: src+0(FP), val1+8(FP), val2+16(FP), val3+24(FP),
                // nByte+32(FP); result: ret+40(FP).  Only the low byte of each val
                // argument is used.
                // The code loads 16 bytes at src and at src+nByte-16 unconditionally,
                // so it relies on nByte >= 16.
                //
                // Register roles: DI = current read position, SI = start of the last
                // 16-byte vector, BX = bytes remaining (Large path only),
                // X0/X1/X8 = val1/val2/val3 broadcast, X6 = zero, X2 = inner per-byte
                // accumulator, X7 = outer 2 x uint64 accumulator, X3/X4/X9 = scratch.
   239          // This is almost identical to maskThenCountByteSSE41Asm.
   240          MOVQ	src+0(FP), DI
   241          MOVD    val1+8(FP), X0
   242          MOVD    val2+16(FP), X1
   243          MOVD    val3+24(FP), X8
   244          MOVQ    nByte+32(FP), BX
   245  
   246          PXOR    X6, X6
   247          LEAQ    -16(DI)(BX*1), SI
                // Broadcast byte 0 of X0, X1 and X8 to all 16 lanes.
   248          PSHUFB  X6, X0
   249          PSHUFB  X6, X1
   250          PSHUFB  X6, X8
   251  
   252          PXOR    X2, X2
   253          PXOR    X7, X7
                // Above 4080 bytes the per-byte counters in X2 could overflow, so
                // take the path that periodically flushes X2 into X7.
   254          CMPQ    BX, $4080
   255          JG      count3BytesSSE41Large
   256  
   257  count3BytesSSE41Loop:
                // Compare one copy of the data against each value and OR the three
                // 0/255 result masks together.
   258          MOVOU   (DI), X3
   259          MOVO    X3, X4
   260          MOVO    X3, X9
   261          PCMPEQB X0, X3
   262          PCMPEQB X1, X4
   263          PCMPEQB X8, X9
   264          POR     X4, X3
   265          POR     X9, X3
                // Subtracting 255 (i.e. -1) adds 1 to the counter for each match.
   266          PSUBB   X3, X2
   267          ADDQ    $16, DI
   268          CMPQ    SI, DI
   269          JG      count3BytesSSE41Loop
   270  
                // Final, possibly overlapping, 16 bytes.  X0, X1 and X8 are
                // overwritten with the comparison results; they are not needed
                // afterwards.
   271          MOVOU   (SI), X3
   272          PCMPEQB X3, X0
   273          PCMPEQB X3, X1
   274          PCMPEQB X3, X8
   275          POR     X1, X0
   276          POR     X8, X0
                // SI-DI is minus the number of bytes already counted by the main
                // loop; the LeadingByteMask load zeroes exactly those leading lanes.
   277          MOVQ    $LeadingByteMask<>(SB), AX
   278          SUBQ    DI, SI
   279          LEAQ    16(AX)(SI*1), DX
   280          MOVOU   (DX), X4
   281          PAND    X4, X0
   282          PSUBB   X0, X2
                // Reduce the per-byte counters to two uint64 sums and add them up.
   283          PSADBW  X6, X2
   284          PADDQ   X2, X7
   285          MOVQ    X7, BX
   286          PEXTRQ  $1, X7, AX
   287          ADDQ    AX, BX
   288          MOVQ    BX, ret+40(FP)
   289          RET
   290  
   291  count3BytesSSE41Large:
                // Process 4064 bytes (254 iterations) before flushing, so that at
                // least 16 bytes are left when control returns to the small loop.
   292          MOVQ    DI, R8
   293          ADDQ    $4064, R8
   294  count3BytesSSE41LargeLoop:
   295          MOVOU   (DI), X3
   296          MOVO    X3, X4
   297          MOVO    X3, X9
   298          PCMPEQB X0, X3
   299          PCMPEQB X1, X4
   300          PCMPEQB X8, X9
   301          POR     X4, X3
   302          POR     X9, X3
   303          PSUBB   X3, X2
   304          ADDQ    $16, DI
   305          CMPQ    R8, DI
   306          JG      count3BytesSSE41LargeLoop
   307  
                // Flush X2 into X7, clear it, and update the remaining byte count.
   308          PSADBW  X6, X2
   309          PADDQ   X2, X7
   310          PXOR    X2, X2
   311          SUBQ    $4064, BX
   312          CMPQ    BX, $4080
   313          JG      count3BytesSSE41Large
   314          JMP     count3BytesSSE41Loop
   315  
   316  TEXT ·countNibblesInSetSSE41Asm(SB),4,$0-32
                // Treats every byte of src[0:nByte] as two 4-bit values, looks each
                // one up in the 16-byte table at tablePtr, and stores the sum of the
                // looked-up table entries in ret.
                // Stack arguments: src+0(FP), tablePtr+8(FP), nByte+16(FP);
                // result: ret+24(FP).
                //
                // Register roles: DI = current read position, SI = start of the last
                // 16-byte vector, BX = bytes remaining (Large path only),
                // X0 = 0x0f mask, X1 = lookup table, X8 = zero, X2 = inner per-byte
                // accumulator, X7 = outer 2 x uint64 accumulator, X3-X6 = scratch.
                //
   317          // This is a hybrid of unpackSeqSSE2Asm and the byte-counting functions
   318          // above.
   319          //
   320          // It assumes nSrcByte >= 16.  We can revisit the tiny case if it's
   321          // ever a bottleneck.
   322          MOVQ	src+0(FP), DI
   323          MOVQ    tablePtr+8(FP), DX
   324          MOVQ    nByte+16(FP), BX
   325  
   326          MOVOU   ·Mask0f0f<>(SB), X0
   327  
   328          // Make X8 a permanent all-zero vector.
   329          PXOR    X8, X8
   330  
   331          // Save start of last 16-byte vector.  This is used for both
   332          // loop-comparison and as the final (usually-)unaligned-load address.
   333          LEAQ    -16(DI)(BX*1), SI
   334          MOVOU   (DX), X1
   335  
   336          // X2 is the "inner accumulator".  Each of its bytes is a count from
   337          // 0-254 of how many nibbles of src[], in bytes at the same position
   338          // mod 16, are set members (i.e. have a table entry of 1).
   339          // Up to 127*16=2032 bytes, this is enough, assuming all table entries
   340          // are in {0, 1}.  Beyond that, the counts might overflow, so we save
   341          // the intermediate results to an "outer accumulator" (X7), as a pair
   342          // of uint64 counts.  Conveniently, the PSADBW instruction directly
   343          // converts the inner count representation to the outer representation,
   344          // though its latency is high enough that it's best avoided in
   345          // innermost loops.
   346          PXOR    X2, X2
   347          PXOR    X7, X7
   348          CMPQ    BX, $2032
   349          JG      countNibblesInSetSSE41Large
   350  
   351  countNibblesInSetSSE41Loop:
   352          // Load 16 bytes from src.
   353          MOVOU   (DI), X3
   354          // Separate high and low nibbles.
                // (PSRLQ shifts whole 64-bit lanes, so bits leak across byte
                // boundaries; the PAND with 0x0f removes them.)
   355          MOVO    X3, X4
   356          PSRLQ   $4, X3
   357          PAND    X0, X4
   358          PAND    X0, X3
   359          // Check set membership.
                // PSHUFB uses each nibble as an index into a copy of the table.
   360          MOVO    X1, X5
   361          MOVO    X1, X6
   362          PSHUFB  X3, X5
   363          PSHUFB  X4, X6
   364          // X5 and X6 now have 1 for set members, and 0 for non-members.  Add
   365          // them to the inner accumulator.
   366          PADDB   X5, X2
   367          PADDB   X6, X2
   368          // Advance to next 16 bytes, check for loop end.
   369          ADDQ    $16, DI
   370          CMPQ    SI, DI
   371          JG      countNibblesInSetSSE41Loop
   372  
   373          // Load last 16 bytes from src.  This may overlap with the previous
   374          // main-loop load.
                // X1 itself is used as one of the lookup destinations here, since the
                // table is not needed again.
   375          MOVOU   (SI), X3
   376          MOVO    X3, X4
   377          PSRLQ   $4, X3
   378          PAND    X0, X4
   379          PAND    X0, X3
   380          MOVO    X1, X5
   381          PSHUFB  X3, X5
   382          PSHUFB  X4, X1
   383          MOVQ    $LeadingByteMask<>(SB), AX
   384          // We now want to mask out the first k bytes of X1/X5, where k is the
   385          // number of overlapping bytes between the last two loads.
   386          //
   387          // DI is the end of the previous load, while SI is the position of the
   388          // current load.  So (SI-DI) is a number in [-16, 0] which is
   389          // -[# of overlapping bytes].
   390          // AX points to 'LeadingByteMask', which is a read-only array where the
   391          // first 16 bytes are 0, and the next 16 bytes are 255.
   392          // So, once SUBQ has replaced SI with SI-DI:
   393          // - if SI-DI=-16, LEAQ 16(AX)(SI*1) points at the 16 bytes starting from
   394          //   &(LeadingByteMask[16 + (-16)]).  These are all zero, which is the
   395          //   correct mask when all 16 bytes overlap.
   396          // - if SI-DI=-15, LEAQ 16(AX)(SI*1) points at the 16 bytes starting from
   397          //   &(LeadingByteMask[16 + (-15)]).  The first 15 bytes are zero, and
   398          //   the last byte is 255.  This is the correct mask when the first 15
   399          //   bytes overlap.
   400          // etc.
   401          SUBQ    DI, SI
                // Combine the high- and low-nibble results before masking.
   402          PADDB   X1, X5
   403          LEAQ    16(AX)(SI*1), DX
   404          MOVOU   (DX), X6
   405          PAND    X6, X5
   406          PADDB   X5, X2
   407          // Now extract results.
   408          PSADBW  X8, X2
   409          PADDQ   X2, X7
   410          MOVQ    X7, BX
   411          PEXTRQ  $1, X7, AX
   412          ADDQ    AX, BX
   413          MOVQ    BX, ret+24(FP)
   414          RET
   415  
   416  countNibblesInSetSSE41Large:
   417          MOVQ    DI, R8
   418          // Stop this loop after 126 rather than 127 iterations.  This way, we
   419          // guarantee there are at least 16 bytes left when we jump to
   420          // countNibblesInSetSSE41Loop.
   421          ADDQ    $2016, R8
   422  countNibblesInSetSSE41LargeLoop:
   423          MOVOU   (DI), X3
   424          MOVO    X3, X4
   425          PSRLQ   $4, X3
   426          PAND    X0, X4
   427          PAND    X0, X3
   428          MOVO    X1, X5
   429          MOVO    X1, X6
   430          PSHUFB  X3, X5
   431          PSHUFB  X4, X6
   432          PADDB   X5, X2
   433          PADDB   X6, X2
   434          ADDQ    $16, DI
   435          CMPQ    R8, DI
   436          JG      countNibblesInSetSSE41LargeLoop
   437  
                // Flush the inner accumulator into the outer one, reset it, and
                // account for the 2016 bytes just consumed.
   438          PSADBW  X8, X2
   439          PADDQ   X2, X7
   440          PXOR    X2, X2
   441          SUBQ    $2016, BX
   442          CMPQ    BX, $2032
   443          JG      countNibblesInSetSSE41Large
   444          JMP     countNibblesInSetSSE41Loop
   445  
   446  TEXT ·countNibblesInTwoSetsSSE41Asm(SB),4,$0-48
                // Treats every byte of src[0:nByte] as two 4-bit values and looks each
                // one up in two 16-byte tables.  The sum of the table1 entries is
                // stored in ret; the sum of the table2 entries is ADDED to the 8-byte
                // value that cnt2Ptr points to.
                // Stack arguments: cnt2Ptr+0(FP), src+8(FP), table1Ptr+16(FP),
                // table2Ptr+24(FP), nByte+32(FP); result: ret+40(FP).
                //
                // Register roles: R10 = cnt2Ptr, DI = current read position,
                // SI = start of the last 16-byte vector, BX = bytes remaining (Large
                // path only), X0 = 0x0f mask, X13 = zero,
                // X1/X8 = table1/table2, X2/X9 = inner per-byte accumulators,
                // X7/X10 = outer 2 x uint64 accumulators, X3-X6/X11/X12 = scratch.
                //
   447          // This is a straightforward extension of countNibblesInSetSSE41Asm.
   448          //
   449          // It assumes nSrcByte >= 16.  We can revisit the tiny case if it's
   450          // ever a bottleneck.
   451          MOVQ    cnt2Ptr+0(FP), R10
   452          MOVQ	src+8(FP), DI
   453          MOVQ    table1Ptr+16(FP), DX
   454          MOVQ    table2Ptr+24(FP), R9
   455          MOVQ    nByte+32(FP), BX
   456  
   457          MOVOU   ·Mask0f0f<>(SB), X0
   458  
   459          PXOR    X13, X13
   460  
   461          PXOR    X2, X2
   462          PXOR    X9, X9
   463          LEAQ    -16(DI)(BX*1), SI
   464          MOVOU   (DX), X1
   465          MOVOU   (R9), X8
   466  
   467          PXOR    X7, X7
   468          PXOR    X10, X10
                // Each 16-byte step can add up to 2 to every inner counter, so the
                // small loop is only entered directly for at most 2032 bytes.
   469          CMPQ    BX, $2032
   470          JG      countNibblesInTwoSetsSSE41Large
   471  
   472  countNibblesInTwoSetsSSE41Loop:
                // X3 = high nibbles, X4 = low nibbles of the next 16 bytes.
   473          MOVOU   (DI), X3
   474          MOVO    X3, X4
   475          PSRLQ   $4, X3
   476          PAND    X0, X4
   477          PAND    X0, X3
                // Look both nibble vectors up in fresh copies of both tables.
   478          MOVO    X1, X5
   479          MOVO    X1, X6
   480          MOVO    X8, X11
   481          MOVO    X8, X12
   482          PSHUFB  X3, X5
   483          PSHUFB  X4, X6
   484          PSHUFB  X3, X11
   485          PSHUFB  X4, X12
   486          PADDB   X5, X2
   487          PADDB   X11, X9
   488          PADDB   X6, X2
   489          PADDB   X12, X9
   490          ADDQ    $16, DI
   491          CMPQ    SI, DI
   492          JG      countNibblesInTwoSetsSSE41Loop
   493  
                // Final, possibly overlapping, 16 bytes.  X1 and X8 are used as
                // lookup destinations here, since the tables are not needed again.
   494          MOVOU   (SI), X3
   495          MOVO    X3, X4
   496          PSRLQ   $4, X3
   497          PAND    X0, X4
   498          PAND    X0, X3
   499          MOVO    X1, X5
   500          MOVO    X8, X11
   501          PSHUFB  X3, X5
   502          PSHUFB  X4, X1
   503          PSHUFB  X3, X11
   504          PSHUFB  X4, X8
                // SI-DI is minus the number of bytes already counted by the main
                // loop; the LeadingByteMask load zeroes exactly those leading lanes.
   505          MOVQ    $LeadingByteMask<>(SB), AX
   506          SUBQ    DI, SI
   507          PADDB   X1, X5
   508          PADDB   X8, X11
   509          LEAQ    16(AX)(SI*1), DX
   510          MOVOU   (DX), X6
   511          PAND    X6, X5
   512          PAND    X6, X11
   513          PADDB   X5, X2
   514          PADDB   X11, X9
   515  
                // Reduce both accumulators.  BX collects the table1 total; R9
                // collects the table2 total on top of the existing *cnt2Ptr value.
   516          PSADBW  X13, X2
   517          PSADBW  X13, X9
   518          PADDQ   X2, X7
   519          PADDQ   X9, X10
   520          MOVQ    X7, BX
   521          MOVQ    X10, R9
   522          PEXTRQ  $1, X7, AX
   523          ADDQ    (R10), R9
   524          PEXTRQ  $1, X10, R11
   525          ADDQ    AX, BX
   526          ADDQ    R11, R9
   527          MOVQ    R9, (R10)
   528          MOVQ    BX, ret+40(FP)
   529          RET
   530  
   531  countNibblesInTwoSetsSSE41Large:
                // Process 2016 bytes (126 iterations) before flushing, so that at
                // least 16 bytes are left when control returns to the small loop.
   532          MOVQ    DI, R8
   533          ADDQ    $2016, R8
   534  countNibblesInTwoSetsSSE41LargeLoop:
   535          MOVOU   (DI), X3
   536          MOVO    X3, X4
   537          PSRLQ   $4, X3
   538          PAND    X0, X4
   539          PAND    X0, X3
   540          MOVO    X1, X5
   541          MOVO    X1, X6
   542          MOVO    X8, X11
   543          MOVO    X8, X12
   544          PSHUFB  X3, X5
   545          PSHUFB  X4, X6
   546          PSHUFB  X3, X11
   547          PSHUFB  X4, X12
   548          PADDB   X5, X2
   549          PADDB   X11, X9
   550          PADDB   X6, X2
   551          PADDB   X12, X9
   552          ADDQ    $16, DI
   553          CMPQ    R8, DI
   554          JG      countNibblesInTwoSetsSSE41LargeLoop
   555  
                // Flush both inner accumulators, clear them, and update the remaining
                // byte count.
   556          PSADBW  X13, X2
   557          PSADBW  X13, X9
   558          PADDQ   X2, X7
   559          PADDQ   X9, X10
   560          PXOR    X2, X2
   561          PXOR    X9, X9
   562          SUBQ    $2016, BX
   563          CMPQ    BX, $2032
   564          JG      countNibblesInTwoSetsSSE41Large
   565          JMP     countNibblesInTwoSetsSSE41Loop
   566  
   567  TEXT ·countUnpackedNibblesInSetSSE41Asm(SB),4,$0-32
                // Uses every byte of src[0:nByte] as an index into the 16-byte table
                // at tablePtr and stores the sum of the looked-up entries in ret.
                // Stack arguments: src+0(FP), tablePtr+8(FP), nByte+16(FP);
                // result: ret+24(FP).
                //
                // Register roles: DI = current read position, SI = start of the last
                // 16-byte vector, BX = bytes remaining (Large path only),
                // X1 = lookup table, X8 = zero, X2 = inner per-byte accumulator,
                // X7 = outer 2 x uint64 accumulator, X3/X5/X6 = scratch.
                //
                // NOTE(review): the source bytes are fed to PSHUFB unmasked.  PSHUFB
                // ignores bits 4-6 of an index byte, but produces 0 when bit 7 is
                // set, so bytes >= 0x80 never contribute to the count.  Presumably
                // callers only pass bytes below 16 -- confirm against the callers.
                //
   568          // This is a slightly simpler variant of countNibblesInSetSSE41Asm (we
   569          // ignore the high bits of each byte).
   570          //
   571          // It assumes nSrcByte >= 16.  We can revisit the tiny case if it's
   572          // ever a bottleneck.
   573          MOVQ	src+0(FP), DI
   574          MOVQ    tablePtr+8(FP), DX
   575          MOVQ    nByte+16(FP), BX
   576  
   577          // Make X8 a permanent all-zero vector.
   578          PXOR    X8, X8
   579  
   580          // Save start of last 16-byte vector.  This is used for both
   581          // loop-comparison and as the final (usually-)unaligned-load address.
   582          LEAQ    -16(DI)(BX*1), SI
   583          MOVOU   (DX), X1
   584  
   585          // X2 is the "inner accumulator".  Each of its bytes is a count from
   586          // 0-255 of how many bytes of src[] at the same position mod 16 are
   587          // set members (i.e. have a table entry of 1).
   588          // Up to 255*16=4080 bytes, this is enough, assuming all table entries
   589          // are in {0, 1}.  Beyond that, the counts might overflow, so we save
   590          // the intermediate results to an "outer accumulator" (X7), as a pair
   591          // of uint64 counts.  Conveniently, the PSADBW instruction directly
   592          // converts the inner count representation to the outer representation,
   593          // though its latency is high enough that it's best avoided in
   594          // innermost loops.
   595          PXOR    X2, X2
   596          PXOR    X7, X7
   597          CMPQ    BX, $4080
   598          JG      countUnpackedNibblesInSetSSE41Large
   599  
   600  countUnpackedNibblesInSetSSE41Loop:
   601          // Load 16 bytes from src.
   602          MOVOU   (DI), X3
   603          // Check set membership.
   604          MOVO    X1, X5
   605          PSHUFB  X3, X5
   606          // X5 now has 1 for set members, and 0 for non-members.  Add to the
   607          // inner accumulator.
   608          PADDB   X5, X2
   609          // Advance to next 16 bytes, check for loop end.
   610          ADDQ    $16, DI
   611          CMPQ    SI, DI
   612          JG      countUnpackedNibblesInSetSSE41Loop
   613  
   614          // Load last 16 bytes from src.  This may overlap with the previous
   615          // main-loop load.
                // X1 itself is the lookup destination here, since the table is not
                // needed again.
   616          MOVOU   (SI), X3
   617          PSHUFB  X3, X1
   618          MOVQ    $LeadingByteMask<>(SB), AX
   619          // We now want to mask out the first k bytes of X1, where k is the
   620          // number of overlapping bytes between the last two loads.
   621          //
   622          // DI is the end of the previous load, while SI is the position of the
   623          // current load.  So (SI-DI) is a number in [-16, 0] which is
   624          // -[# of overlapping bytes].
   625          // AX points to 'LeadingByteMask', which is a read-only array where the
   626          // first 16 bytes are 0, and the next 16 bytes are 255.
   627          // So, once SUBQ has replaced SI with SI-DI:
   628          // - if SI-DI=-16, LEAQ 16(AX)(SI*1) points at the 16 bytes starting from
   629          //   &(LeadingByteMask[16 + (-16)]).  These are all zero, which is the
   630          //   correct mask when all 16 bytes overlap.
   631          // - if SI-DI=-15, LEAQ 16(AX)(SI*1) points at the 16 bytes starting from
   632          //   &(LeadingByteMask[16 + (-15)]).  The first 15 bytes are zero, and
   633          //   the last byte is 255.  This is the correct mask when the first 15
   634          //   bytes overlap.
   635          // etc.
   636          SUBQ    DI, SI
   637          LEAQ    16(AX)(SI*1), DX
   638          MOVOU   (DX), X6
   639          PAND    X6, X1
   640          PADDB   X1, X2
   641          // Now extract results.
   642          PSADBW  X8, X2
   643          PADDQ   X2, X7
   644          MOVQ    X7, BX
   645          PEXTRQ  $1, X7, AX
   646          ADDQ    AX, BX
   647          MOVQ    BX, ret+24(FP)
   648          RET
   649  
   650  countUnpackedNibblesInSetSSE41Large:
   651          MOVQ    DI, R8
   652          // Stop this loop after 254 rather than 255 iterations.  This way, we
   653          // guarantee there are at least 16 bytes left when we jump to
   654          // countUnpackedNibblesInSetSSE41Loop.
   655          ADDQ    $4064, R8
   656  countUnpackedNibblesInSetSSE41LargeLoop:
   657          MOVOU   (DI), X3
   658          MOVO    X1, X5
   659          PSHUFB  X3, X5
   660          PADDB   X5, X2
   661          ADDQ    $16, DI
   662          CMPQ    R8, DI
   663          JG      countUnpackedNibblesInSetSSE41LargeLoop
   664  
                // Flush the inner accumulator into the outer one, reset it, and
                // account for the 4064 bytes just consumed.
   665          PSADBW  X8, X2
   666          PADDQ   X2, X7
   667          PXOR    X2, X2
   668          SUBQ    $4064, BX
   669          CMPQ    BX, $4080
   670          JG      countUnpackedNibblesInSetSSE41Large
   671          JMP     countUnpackedNibblesInSetSSE41Loop
   672  
   673  TEXT ·countUnpackedNibblesInTwoSetsSSE41Asm(SB),4,$0-48
                // Uses every byte of src[0:nByte] as an index into two 16-byte tables.
                // The sum of the table1 entries is stored in ret; the sum of the
                // table2 entries is ADDED to the 8-byte value that cnt2Ptr points to.
                // Stack arguments: cnt2Ptr+0(FP), src+8(FP), table1Ptr+16(FP),
                // table2Ptr+24(FP), nByte+32(FP); result: ret+40(FP).
                //
                // Register roles: R10 = cnt2Ptr, DI = current read position,
                // SI = start of the last 16-byte vector, BX = bytes remaining (Large
                // path only), X13 = zero, X1/X8 = table1/table2,
                // X2/X9 = inner per-byte accumulators,
                // X7/X10 = outer 2 x uint64 accumulators, X3/X5/X6/X11 = scratch.
                //
                // NOTE(review): the source bytes are fed to PSHUFB unmasked.  PSHUFB
                // ignores bits 4-6 of an index byte, but produces 0 when bit 7 is
                // set, so bytes >= 0x80 never contribute to either count.  Presumably
                // callers only pass bytes below 16 -- confirm against the callers.
                //
   674          // This is a slightly simpler variant of countNibblesInTwoSetsSSE41Asm
   675          // (we ignore the high bits of each byte).
   676          //
   677          // It assumes nSrcByte >= 16.  We can revisit the tiny case if it's
   678          // ever a bottleneck.
   679          MOVQ    cnt2Ptr+0(FP), R10
   680          MOVQ	src+8(FP), DI
   681          MOVQ    table1Ptr+16(FP), DX
   682          MOVQ    table2Ptr+24(FP), R9
   683          MOVQ    nByte+32(FP), BX
   684  
   685          PXOR    X13, X13
   686          PXOR    X2, X2
   687          PXOR    X9, X9
   688          LEAQ    -16(DI)(BX*1), SI
   689          MOVOU   (DX), X1
   690          MOVOU   (R9), X8
   691  
   692          PXOR    X7, X7
   693          PXOR    X10, X10
                // Each 16-byte step adds at most 1 to every inner counter, so the
                // small loop is only entered directly for at most 4080 bytes.
   694          CMPQ    BX, $4080
   695          JG      countUnpackedNibblesInTwoSetsSSE41Large
   696  
   697  countUnpackedNibblesInTwoSetsSSE41Loop:
                // Look the next 16 bytes up in fresh copies of both tables.
   698          MOVOU   (DI), X3
   699          MOVO    X1, X5
   700          MOVO    X8, X11
   701          PSHUFB  X3, X5
   702          PSHUFB  X3, X11
   703          PADDB   X5, X2
   704          PADDB   X11, X9
   705          ADDQ    $16, DI
   706          CMPQ    SI, DI
   707          JG      countUnpackedNibblesInTwoSetsSSE41Loop
   708  
                // Final, possibly overlapping, 16 bytes.  X1 and X8 are the lookup
                // destinations here, since the tables are not needed again.
   709          MOVOU   (SI), X3
   710          PSHUFB  X3, X1
   711          PSHUFB  X3, X8
                // SI-DI is minus the number of bytes already counted by the main
                // loop; the LeadingByteMask load zeroes exactly those leading lanes.
   712          MOVQ    $LeadingByteMask<>(SB), AX
   713          SUBQ    DI, SI
   714          LEAQ    16(AX)(SI*1), DX
   715          MOVOU   (DX), X6
   716          PAND    X6, X1
   717          PAND    X6, X8
   718          PADDB   X1, X2
   719          PADDB   X8, X9
   720  
                // Reduce both accumulators.  BX collects the table1 total; R9
                // collects the table2 total on top of the existing *cnt2Ptr value.
   721          PSADBW  X13, X2
   722          PSADBW  X13, X9
   723          PADDQ   X2, X7
   724          PADDQ   X9, X10
   725          MOVQ    X7, BX
   726          MOVQ    X10, R9
   727          PEXTRQ  $1, X7, AX
   728          ADDQ    (R10), R9
   729          PEXTRQ  $1, X10, R11
   730          ADDQ    AX, BX
   731          ADDQ    R11, R9
   732          MOVQ    R9, (R10)
   733          MOVQ    BX, ret+40(FP)
   734          RET
   735  
   736  countUnpackedNibblesInTwoSetsSSE41Large:
                // Process 4064 bytes (254 iterations) before flushing, so that at
                // least 16 bytes are left when control returns to the small loop.
   737          MOVQ    DI, R8
   738          ADDQ    $4064, R8
   739  countUnpackedNibblesInTwoSetsSSE41LargeLoop:
   740          MOVOU   (DI), X3
   741          MOVO    X1, X5
   742          MOVO    X8, X11
   743          PSHUFB  X3, X5
   744          PSHUFB  X3, X11
   745          PADDB   X5, X2
   746          PADDB   X11, X9
   747          ADDQ    $16, DI
   748          CMPQ    R8, DI
   749          JG      countUnpackedNibblesInTwoSetsSSE41LargeLoop
   750  
                // Flush both inner accumulators, clear them, and update the remaining
                // byte count.
   751          PSADBW  X13, X2
   752          PSADBW  X13, X9
   753          PADDQ   X2, X7
   754          PADDQ   X9, X10
   755          PXOR    X2, X2
   756          PXOR    X9, X9
   757          SUBQ    $4064, BX
   758          CMPQ    BX, $4080
   759          JG      countUnpackedNibblesInTwoSetsSSE41Large
   760          JMP     countUnpackedNibblesInTwoSetsSSE41Loop
   761  
   762  TEXT ·accumulate8SSE41Asm(SB),4,$0-24
                // Adds up the bytes of src[0:nByte], each taken as an unsigned 8-bit
                // value, and stores the total in ret.
                // Stack arguments: src+0(FP), nByte+8(FP); result: ret+16(FP).
                //
                // Register roles: DI = current read position, SI = loop limit (then
                // start of the last 16-byte vector), X0 = 2 x uint64 partial sums,
                // X1 = zero, X2/X3 = scratch.
                //
   763          // This assumes nByte >= 16.
   764          MOVQ	src+0(FP), DI
   765          MOVQ    nByte+8(FP), BX
   766  
   767          // X0 is a pair of uint64s containing partial sums.
   768          PXOR    X0, X0
   769          // X1 is a fixed all-zero vector.
   770          PXOR    X1, X1
   771          // SI points to 32 bytes before the end of src[].
   772          // (2x unroll improves the long-array benchmark by ~7%.)
   773          LEAQ    -32(DI)(BX*1), SI
                // With 32 or fewer bytes, skip the unrolled loop entirely.
   774          CMPQ    SI, DI
   775          JLE     accumulate8SSE41Final32
   776  
   777  accumulate8SSE41Loop:
                // PSADBW against zero sums each 8-byte half of a vector into a
                // uint64, so no separate byte-wide accumulator is needed here.
   778          MOVOU   (DI), X2
   779          MOVOU   16(DI), X3
   780          PSADBW  X1, X2
   781          PSADBW  X1, X3
   782          ADDQ    $32, DI
   783          PADDQ   X2, X0
   784          PADDQ   X3, X0
   785          CMPQ    SI, DI
   786          JG      accumulate8SSE41Loop
   787  
   788  accumulate8SSE41Final32:
                // SI now becomes the start of the last 16-byte vector.  If more than
                // 16 bytes are still unprocessed, consume one more full vector.
   789          ADDQ    $16, SI
   790          CMPQ    SI, DI
   791          JLE     accumulate8SSE41Final16
   792          MOVOU   (DI), X2
   793          PSADBW  X1, X2
   794          ADDQ    $16, DI
   795          PADDQ   X2, X0
   796  
   797  accumulate8SSE41Final16:
   798          // Load last bytes, use LeadingByteMask to avoid double-counting.
                // SI-DI is minus the number of bytes of this vector that were already
                // summed; the mask loaded from LeadingByteMask+16+(SI-DI) zeroes them.
   799          MOVOU   (SI), X2
   800          MOVQ    $LeadingByteMask<>(SB), AX
   801          SUBQ    DI, SI
   802          LEAQ    16(AX)(SI*1), DX
   803          MOVOU   (DX), X3
   804          PAND    X3, X2
   805          PSADBW  X1, X2
   806          PADDQ   X2, X0
   807  
   808          // Extract final sum.
   809          MOVQ    X0, BX
   810          PEXTRQ  $1, X0, AX
   811          ADDQ    AX, BX
   812          MOVQ    BX, ret+16(FP)
   813          RET
   814  
   815  TEXT ·accumulate8GreaterSSE41Asm(SB),4,$0-32
                // Adds up those bytes of src[0:nByte] that are strictly greater than
                // val (unsigned comparison), and stores the total in ret.
                // Stack arguments: src+0(FP), val+8(FP), nByte+16(FP);
                // result: ret+24(FP).  Only the low byte of val is used.
                //
                // Register roles: DI = current read position, SI = start of the last
                // 16-byte vector, X0 = 2 x uint64 partial sums, X1 = zero,
                // X4 = val broadcast, X2/X3/X5 = scratch.
                //
   816          // Variant of accumulate8 that masks out bytes <= the given value.
   817          // If all bytes < 128, it is possible to speed this up by ~2-7% by
   818          // replacing the saturating-subtract + equality-to-zero combination
   819          // with a single _mm_cmpgt_epi8() operation.  But this is supposed to
   820          // be a safe function, so I don't think that minor gain is worth the
   821          // unvalidated condition.
   822          //
   823          // This assumes nByte >= 16.
   824          MOVQ	src+0(FP), DI
   825          MOVD    val+8(FP), X4
   826          MOVQ    nByte+16(FP), BX
   827  
   828          // X1 is a fixed all-zero vector.
   829          PXOR    X1, X1
   830          // X0 is a pair of uint64s containing partial sums.
   831          PXOR    X0, X0
   832  
   833          PSHUFB  X1, X4
   834          // X4 now has all bytes equal to val.
   835  
   836          // SI points to 16 bytes before the end of src[].
   837          LEAQ    -16(DI)(BX*1), SI
   838  
   839  accumulate8GreaterSSE41Loop:
                // X5 keeps an unmodified copy of the data for the final PANDN.
   840          MOVOU   (DI), X2
   841          MOVO    X2, X5
   842          PSUBUSB X4, X2
   843          // X2 is 0 for all bytes originally <= val, and nonzero elsewhere.
   844          PCMPEQB X1, X2
   845          // X2 is now 255 for all bytes originally <= val, and 0 elsewhere.
                // PANDN computes (NOT X2) AND X5, i.e. keeps only the bytes > val.
   846          PANDN   X5, X2
   847          PSADBW  X1, X2
   848          ADDQ    $16, DI
   849          PADDQ   X2, X0
   850          CMPQ    SI, DI
   851          JG      accumulate8GreaterSSE41Loop
   852  
   853          // Load last bytes, use LeadingByteMask to avoid double-counting.
                // The filtering steps are the same as in the loop, interleaved with
                // the computation of the tail mask (X3) from SI-DI.
   854          MOVOU   (SI), X2
   855          MOVQ    $LeadingByteMask<>(SB), AX
   856          MOVO    X2, X5
   857          SUBQ    DI, SI
   858          PSUBUSB X4, X2
   859          LEAQ    16(AX)(SI*1), DX
   860          PCMPEQB X1, X2
   861          MOVOU   (DX), X3
   862          PANDN   X5, X2
   863          PAND    X3, X2
   864          PSADBW  X1, X2
   865          PADDQ   X2, X0
   866  
   867          // Extract final sum.
   868          MOVQ    X0, BX
   869          PEXTRQ  $1, X0, AX
   870          ADDQ    AX, BX
   871          MOVQ    BX, ret+24(FP)
   872          RET