github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/bloom/filter_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #define salt0 0x47b6137b // salt0..salt7: 32-bit multipliers used by the scalar code paths.
     6  #define salt1 0x44974d91 // Through the insert/check macros, saltN is paired with the 32-bit
     7  #define salt2 0x8824ad5b // word at byte offset 4*N of a 32-byte block.
     8  #define salt3 0xa2b7289d
     9  #define salt4 0x705495c7 // NOTE(review): the AVX2 paths take their multipliers from salt(SB),
    10  #define salt5 0x2df1424b // which is defined outside this file; presumably it holds these same
    11  #define salt6 0x9efc4947 // eight values in the same order -- confirm in block_amd64.s.
    12  #define salt7 0x5c6bfb31
    13  
    14  // See block_amd64.s for a description of this algorithm.
        //
        // generateMask(src, dst) builds a 256-bit mask in dst. src is expected to
        // hold the low 32 bits of the hash in each of its eight 32-bit lanes. Each
        // lane is multiplied by the matching lane of salt(SB), the top 5 bits of
        // the product (>> 27) become a shift count, and the value loaded from
        // ones(SB) is shifted left by that count. src is overwritten.
        // NOTE(review): ones(SB) and salt(SB) are defined outside this file; ones
        // is presumably eight 32-bit 1s, and VMOVDQA needs it 32-byte aligned --
        // confirm in block_amd64.s.
    15  #define generateMask(src, dst) \
    16      VMOVDQA ones(SB), dst \
    17      VPMULLD salt(SB), src, src \
    18      VPSRLD $27, src, src \
    19      VPSLLVD src, dst, dst
    20  
        // applyMask(src, dst) ORs the 32 bytes at memory operand dst into the
        // vector register src, then stores src back to dst with an unaligned
        // store. src is left holding the updated block.
    21  #define applyMask(src, dst) \
    22      VPOR dst, src, src \
    23      VMOVDQU src, dst
    24  
        // fasthash1x64(scale, value) replaces the 64-bit hash in `value` with
        //   (((value >> 32) * scale) >> 32) << 5
        // i.e. the upper 32 bits of the hash are range-reduced by `scale` and the
        // result is multiplied by 32 to form a byte offset. The reduced index is
        // below `scale` provided scale < 2^32. Flags are clobbered.
    25  #define fasthash1x64(scale, value) \
    26      SHRQ $32, value \
    27      IMULQ scale, value \
    28      SHRQ $32, value \
    29      SHLQ $5, value
    30  
        // fasthash4x64(scale, value) is the vector form of fasthash1x64, applied
        // independently to each of the four 64-bit lanes of `value`. VPMULUDQ
        // multiplies only the low 32 bits of each lane, so only the low 32 bits
        // of each `scale` lane take part in the product.
    31  #define fasthash4x64(scale, value) \
    32      VPSRLQ $32, value, value \
    33      VPMULUDQ scale, value, value \
    34      VPSRLQ $32, value, value \
    35      VPSLLQ $5, value, value
    36  
        // extract4x64 copies the four 64-bit lanes of srcYMM into the general
        // purpose registers r0..r3 (lane 0 -> r0, ..., lane 3 -> r3). srcXMM must
        // name the low 128 bits of srcYMM (e.g. Y1/X1); tmpXMM is overwritten
        // with the upper 128 bits.
    37  #define extract4x64(srcYMM, srcXMM, tmpXMM, r0, r1, r2, r3) \
    38      VEXTRACTI128 $1, srcYMM, tmpXMM \
    39      MOVQ srcXMM, r0 \
    40      VPEXTRQ $1, srcXMM, r1 \
    41      MOVQ tmpXMM, r2 \
    42      VPEXTRQ $1, tmpXMM, r3
    43  
        // insert(salt, src, dst) sets one bit in the 32-bit memory word dst: bit
        // number (src * salt) >> 27, computed in 32-bit arithmetic. src is
        // preserved; CX, DX and the flags are clobbered, so callers must not keep
        // live values in CX or DX.
    44  #define insert(salt, src, dst) \
    45      MOVL src, CX \
    46      IMULL salt, CX \
    47      SHRL $27, CX \
    48      MOVL $1, DX \
    49      SHLL CX, DX \
    50      ORL DX, dst
    51  
        // check(salt, b, x) tests bit (x * salt) >> 27 of the 32-bit word b and
        // jumps to the label `notfound` when that bit is clear (BTL leaves the bit
        // in CF; JAE branches on CF == 0). The enclosing function must define
        // `notfound`. x is preserved; CX, DX and the flags are clobbered.
    52  #define check(salt, b, x) \
    53      MOVL b, CX \
    54      MOVL x, DX \
    55      IMULL salt, DX \
    56      SHRL $27, DX \
    57      BTL DX, CX \
    58      JAE notfound
    59  
    60  // func filterInsertBulk(f []Block, x []uint64)
        //
        // Inserts every 64-bit hash in x into the filter f. For each hash, the
        // upper 32 bits select a 32-byte block (fasthash1x64 / fasthash4x64) and
        // the lower 32 bits select one bit to set in each of the eight 32-bit
        // words of that block. No bounds or emptiness check on f is done here.
        //
        // Register roles:
        //   AX = &f[0]   CX = len(f)   BX = &x[0]   DX = len(x)
        //   SI = index of the next hash in x
        //   DI = loop bound: len(x) rounded down to a multiple of 4 on the AVX2
        //        path, len(x) on the fallback path
    61  TEXT ·filterInsertBulk(SB), NOSPLIT, $0-48
    62      MOVQ f_base+0(FP), AX
    63      MOVQ f_len+8(FP), CX
    64      MOVQ x_base+24(FP), BX
    65      MOVQ x_len+32(FP), DX
    66      CMPB ·hasAVX2(SB), $0
    67      JE fallback
    68  avx2:
    69      VPBROADCASTQ f_len+8(FP), Y0 // Y0 = len(f) in all four 64-bit lanes
    70      // Loop initialization: SI holds the current index in `x`, DI is the number
    71      // of elements in `x` rounded down to the nearest multiple of 4.
    72      XORQ SI, SI
    73      MOVQ DX, DI
    74      SHRQ $2, DI
    75      SHLQ $2, DI
    76  avx2loop4x64:
    77      CMPQ SI, DI
    78      JAE avx2loop1x64
    79  
    80      // The masks and indexes for 4 input hashes are computed in each loop
    81      // iteration. The hashes are loaded in Y1 so we can use vector instructions
    82      // to compute all 4 indexes in parallel. The lower 32 bits of the hashes are
    83      // also broadcast into 4 YMM registers to compute the 4 masks that will then
    84      // be applied to the filter.
    85      VMOVDQU (BX)(SI*8), Y1
    86      VPBROADCASTD 0(BX)(SI*8), Y2
    87      VPBROADCASTD 8(BX)(SI*8), Y3
    88      VPBROADCASTD 16(BX)(SI*8), Y4
    89      VPBROADCASTD 24(BX)(SI*8), Y5
    90  
    91      fasthash4x64(Y0, Y1) // Y1 = byte offsets of the 4 selected blocks
    92      generateMask(Y2, Y6) // Y6..Y9 = masks for hashes SI+0..SI+3
    93      generateMask(Y3, Y7)
    94      generateMask(Y4, Y8)
    95      generateMask(Y5, Y9)
    96  
    97      // The next block of instructions moves the block offsets from the vector
    98      // to general purpose registers in order to use them as offsets when
    99      // applying the masks to the filter.
   100      extract4x64(Y1, X1, X10, R8, R9, R10, R11)
   101  
   102      // Apply masks to the filter; this operation is sensitive to aliasing: when
   103      // blocks overlap, the CPU has to serialize the reads and writes, which has
   104      // a measurable impact on throughput. This would be frequent for small bloom
   105      // filters which may have only a few blocks; the probability of seeing
   106      // overlapping blocks on large filters should be small enough to make this
   107      // a non-issue though.
   108      applyMask(Y6, (AX)(R8*1))
   109      applyMask(Y7, (AX)(R9*1))
   110      applyMask(Y8, (AX)(R10*1))
   111      applyMask(Y9, (AX)(R11*1))
   112  
   113      ADDQ $4, SI
   114      JMP avx2loop4x64
   115  avx2loop1x64:
   116      // Compute trailing elements in `x` if the length was not a multiple of 4.
   117      // This is the same algorithm as the one in the loop4x64 section, working
   118      // on a single mask/block pair at a time.
   119      CMPQ SI, DX
   120      JE avx2done
   121      MOVQ (BX)(SI*8), R8
   122      VPBROADCASTD (BX)(SI*8), Y0 // Y0 is reusable: the 4-wide loop no longer runs
   123      fasthash1x64(CX, R8) // R8 = byte offset of the selected block
   124      generateMask(Y0, Y1)
   125      applyMask(Y1, (AX)(R8*1))
   126      INCQ SI
   127      JMP avx2loop1x64
   128  avx2done:
   129      VZEROUPPER // clear upper YMM halves before returning to Go code
   130      JMP done
   131  fallback:
   132      XORQ SI, SI
   133      MOVQ DX, DI // insert() clobbers DX: keep len(x) in DI
   134      MOVQ CX, R10 // insert() clobbers CX: keep len(f) in R10
   135  loop:
   136      CMPQ SI, DI
   137      JE done
   138      MOVLQZX (BX)(SI*8), R8 // R8 = low 32 bits of the hash (bit selector)
   139      MOVQ (BX)(SI*8), R9
   140      fasthash1x64(R10, R9) // R9 = byte offset of the selected block
   141      insert($salt0, R8, 0(AX)(R9*1))
   142      insert($salt1, R8, 4(AX)(R9*1))
   143      insert($salt2, R8, 8(AX)(R9*1))
   144      insert($salt3, R8, 12(AX)(R9*1))
   145      insert($salt4, R8, 16(AX)(R9*1))
   146      insert($salt5, R8, 20(AX)(R9*1))
   147      insert($salt6, R8, 24(AX)(R9*1))
   148      insert($salt7, R8, 28(AX)(R9*1))
   149      INCQ SI
   150      JMP loop
   151  done:
   152      RET
   153  
   154  // func filterInsert(f []Block, x uint64)
        //
        // Inserts the single hash x into f: the upper 32 bits of x select a
        // 32-byte block and the lower 32 bits select one bit to set in each of
        // its eight 32-bit words. No bounds or emptiness check on f is done here.
   155  TEXT ·filterInsert(SB), NOSPLIT, $0-32
   156      MOVQ f_base+0(FP), AX // AX = &f[0]
   157      MOVQ f_len+8(FP), BX // BX = len(f)
   158      MOVQ x+24(FP), CX // CX = x
   159      fasthash1x64(BX, CX) // CX = byte offset of the selected block
   160      CMPB ·hasAVX2(SB), $0
   161      JE fallback
   162  avx2:
   163      VPBROADCASTD x+24(FP), Y1 // Y1 = low 32 bits of x in all eight lanes
   164      generateMask(Y1, Y0) // Y0 = 256-bit mask for this hash
   165      applyMask(Y0, (AX)(CX*1)) // block |= mask
   166      VZEROUPPER // clear upper YMM halves before returning to Go code
   167      RET
   168  fallback:
   169      ADDQ CX, AX // AX = address of the block; insert() clobbers CX and DX
   170      MOVL x+24(FP), BX // BX = low 32 bits of x
   171      insert($salt0, BX, 0(AX))
   172      insert($salt1, BX, 4(AX))
   173      insert($salt2, BX, 8(AX))
   174      insert($salt3, BX, 12(AX))
   175      insert($salt4, BX, 16(AX))
   176      insert($salt5, BX, 20(AX))
   177      insert($salt6, BX, 24(AX))
   178      insert($salt7, BX, 28(AX))
   179      RET
   180  
   181  // func filterCheck(f []Block, x uint64) bool
        //
        // Reports whether all the bits that inserting x would set are already set
        // in f. The block is selected from the upper 32 bits of x and the bits
        // from the lower 32 bits, exactly as in filterInsert. The one-byte result
        // is written to ret+32(FP). No bounds or emptiness check on f is done.
   182  TEXT ·filterCheck(SB), NOSPLIT, $0-33
   183      MOVQ f_base+0(FP), AX // AX = &f[0]
   184      MOVQ f_len+8(FP), BX // BX = len(f)
   185      MOVQ x+24(FP), CX // CX = x
   186      fasthash1x64(BX, CX) // CX = byte offset of the selected block
   187      CMPB ·hasAVX2(SB), $0
   188      JE fallback
   189  avx2:
   190      VPBROADCASTD x+24(FP), Y1 // Y1 = low 32 bits of x in all eight lanes
   191      generateMask(Y1, Y0) // Y0 = 256-bit mask for this hash
   192      VPAND (AX)(CX*1), Y0, Y1 // Y1 = block & mask
   193      VPTEST Y0, Y1 // CF = 1 iff Y0 has no bit that is missing from Y1
   194      SETCS ret+32(FP) // ret = CF, i.e. true when every mask bit is set in the block
   195      VZEROUPPER // clear upper YMM halves before returning to Go code
   196      RET
   197  fallback:
   198      ADDQ CX, AX // AX = address of the block; check() clobbers CX and DX
   199      MOVL x+24(FP), BX // BX = low 32 bits of x
   200      check($salt0, 0(AX), BX) // each check jumps to notfound if its bit is clear
   201      check($salt1, 4(AX), BX)
   202      check($salt2, 8(AX), BX)
   203      check($salt3, 12(AX), BX)
   204      check($salt4, 16(AX), BX)
   205      check($salt5, 20(AX), BX)
   206      check($salt6, 24(AX), BX)
   207      check($salt7, 28(AX), BX)
   208      MOVB $1, CX // all eight bits were set
   209      JMP done
   210  notfound:
   211      XORB CX, CX // at least one bit was clear
   212  done:
   213      MOVB CX, ret+32(FP)
   214      RET