github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/bloom/block_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #define salt0 0x47b6137b // salt0..salt7: one 32-bit multiplier per word of the block; bit index for word i is (x * salt_i) >> 27
     6  #define salt1 0x44974d91
     7  #define salt2 0x8824ad5b
     8  #define salt3 0xa2b7289d
     9  #define salt4 0x705495c7
    10  #define salt5 0x2df1424b
    11  #define salt6 0x9efc4947
    12  #define salt7 0x5c6bfb31 // NOTE(review): presumably the salt values of the Parquet split-block bloom filter — confirm against the spec
    13  
    14  DATA ones+0(SB)/4, $1 // ones: eight 32-bit words, each equal to 1 (the value generateMask shifts left)
    15  DATA ones+4(SB)/4, $1
    16  DATA ones+8(SB)/4, $1
    17  DATA ones+12(SB)/4, $1
    18  DATA ones+16(SB)/4, $1
    19  DATA ones+20(SB)/4, $1
    20  DATA ones+24(SB)/4, $1
    21  DATA ones+28(SB)/4, $1
    22  GLOBL ones(SB), RODATA|NOPTR, $32 // 32 bytes, read-only, no pointers. NOTE(review): loaded with VMOVDQA, which needs a 32-byte aligned address — confirm the linker guarantees it
    23  
    24  DATA salt+0(SB)/4, $salt0 // salt: salt0..salt7 stored as eight consecutive 32-bit words
    25  DATA salt+4(SB)/4, $salt1
    26  DATA salt+8(SB)/4, $salt2
    27  DATA salt+12(SB)/4, $salt3
    28  DATA salt+16(SB)/4, $salt4
    29  DATA salt+20(SB)/4, $salt5
    30  DATA salt+24(SB)/4, $salt6
    31  DATA salt+28(SB)/4, $salt7
    32  GLOBL salt(SB), RODATA|NOPTR, $32 // 32 bytes, read-only, no pointers; memory operand of VPMULLD in generateMask
    33  
    34  // generateMask is a SIMD implementation of the mask function declared in
    35  // block_default.go and block_optimized.go. For each of the eight 32-bit words
    36  // of the bloom filter block, the operation performed is:
    37  //
    38  //      block[i] = 1 << ((x * salt[i]) >> 27)
    39  //
    40  // The multiply keeps the low 32 bits of the product and the shift keeps its top
    41  // 5 bits, so every word of the mask has exactly one bit set (index 0..31).
    42  //
    43  // * src is a memory location holding the 32-bit value x; it is only read.
    44  //
    45  // * tmp is a YMM register used as scratch space; its previous content is
    46  //   overwritten.
    47  //
    48  // * dst is a YMM register where the final mask is written.
    49  //
    50  // Both functions in this file test ·hasAVX2 before expanding this macro.
    51  #define generateMask(src, tmp, dst) \
    52      VMOVDQA ones(SB), dst \
    53      VPBROADCASTD src, tmp \
    54      VPMULLD salt(SB), tmp, tmp \
    55      VPSRLD $27, tmp, tmp \
    56      VPSLLVD tmp, dst, dst
    57  
    58  #define insert(salt, src, dst) \
    59      MOVL src, CX \
    60      IMULL salt, CX \
    61      SHRL $27, CX \
    62      MOVL $1, DX \
    63      SHLL CX, DX \
    64      ORL DX, dst // insert: dst |= 1 << ((src * salt) >> 27), all 32-bit; overwrites CX and DX
    65  
    66  #define check(salt, b, x) \
    67      MOVL b, CX \
    68      MOVL x, DX \
    69      IMULL salt, DX \
    70      SHRL $27, DX \
    71      BTL DX, CX \
    72      JAE notfound // check: jump to notfound when bit ((x * salt) >> 27) of word b is clear; overwrites CX and DX; the enclosing function must define notfound
    73  
    74  // func blockInsert(b *Block, x uint32)
    75  TEXT ·blockInsert(SB), NOSPLIT, $0-16 // sets, in each of the eight 32-bit words at b, the bit selected by x
    76      MOVQ b+0(FP), AX // AX = b
    77      CMPB ·hasAVX2(SB), $0
    78      JE fallback // hasAVX2 == 0: use the scalar path
    79  avx2:
    80      generateMask(x+8(FP), Y1, Y0) // Y0 = mask for x, Y1 = scratch
    81      // Set all 1 bits of the mask in the bloom filter block.
    82      VPOR (AX), Y0, Y0 // Y0 = block | mask
    83      VMOVDQU Y0, (AX) // write the 32 bytes back to the block
    84      VZEROUPPER // clear the upper YMM halves before returning
    85      RET
    86  fallback:
    87      MOVL x+8(FP), BX // BX = x; the insert macro overwrites CX and DX
    88      insert($salt0, BX, 0(AX)) // word i of the block is at byte offset 4*i
    89      insert($salt1, BX, 4(AX))
    90      insert($salt2, BX, 8(AX))
    91      insert($salt3, BX, 12(AX))
    92      insert($salt4, BX, 16(AX))
    93      insert($salt5, BX, 20(AX))
    94      insert($salt6, BX, 24(AX))
    95      insert($salt7, BX, 28(AX))
    96      RET
    97  
    98  // func blockCheck(b *Block, x uint32) bool
    99  TEXT ·blockCheck(SB), NOSPLIT, $0-17 // reports whether every bit selected by x is set in the eight 32-bit words at b
   100      MOVQ b+0(FP), AX // AX = b
   101      CMPB ·hasAVX2(SB), $0
   102      JE fallback // hasAVX2 == 0: use the scalar path
   103  avx2:
   104      generateMask(x+8(FP), Y1, Y0) // Y0 = mask for x, Y1 = scratch
   105      // Intersect the mask with the bloom filter block, then test that no bit of
   106      // the mask is missing from the intersection, which is the case only if all
   107      // the bits selected by `x` are set in the block.
   108      VPAND (AX), Y0, Y1 // Y1 = block & mask
   109      VPTEST Y0, Y1      // if (Y0 & ^Y1) == 0 { CF = 1 }
   110      SETCS ret+16(FP)   // return CF == 1
   111      VZEROUPPER // clear the upper YMM halves before returning
   112      RET
   113  fallback:
   114      MOVL x+8(FP), BX // BX = x; the check macro overwrites CX and DX
   115      check($salt0, 0(AX), BX) // each check jumps to notfound if its bit is clear
   116      check($salt1, 4(AX), BX)
   117      check($salt2, 8(AX), BX)
   118      check($salt3, 12(AX), BX)
   119      check($salt4, 16(AX), BX)
   120      check($salt5, 20(AX), BX)
   121      check($salt6, 24(AX), BX)
   122      check($salt7, 28(AX), BX)
   123      MOVB $1, CX // all eight bits were set
   124      JMP done
   125  notfound:
   126      XORB CX, CX // at least one bit was clear
   127  done:
   128      MOVB CX, ret+16(FP)
   129      RET