github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/hashprobe/hashprobe_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  // This version of the probing algorithm for 32 bit keys takes advantage of
     6  // the memory layout of table groups and SIMD instructions to accelerate the
     7  // probing operations.
     8  //
     9  // The first 32 bytes of a table group start with the array of keys, which
    10  // fits into a single vector register (YMM) and can be loaded and tested with
    11  // a single instruction; the bit mask of in-use slots is read from byte 56.
    12  //
    13  // A first version of the table group used the number of keys held in the
    14  // group instead of a bit mask, which required the probing operation to
    15  // reconstruct the bit mask during the lookup operation in order to identify
    16  // which elements of the VPCMPEQD result should be retained. The extra CPU
    17  // instructions used to reconstruct the bit mask had a measurable overhead.
    18  // By holding the bit mask in the data structure, we can determine the number
    19  // of keys in a group using the POPCNT instruction, and avoid recomputing the
    20  // mask during lookups.
    21  //
    22  // func multiProbe32AVX2(table []table32Group, numKeys int, hashes []uintptr, keys sparse.Uint32Array, values []int32) int
        //
        // For each i in [0, len(hashes)), probes the group selected by hashes[i]
        // for the i-th 32-bit key. On a hit, the value stored next to the key is
        // written to values[i]. On a miss, the key is inserted with the current
        // numKeys as its value, that value is written to values[i], and numKeys is
        // incremented. A group with all 7 slots in use is skipped by retrying with
        // hash+1. Returns the updated numKeys.
        //
        // Group layout as addressed by this code (64 bytes per group):
        //   byte  0: keys, 4 bytes each
        //   byte 28: values, 4 bytes each
        //   byte 56: 32-bit mask of in-use slots
        //
        // NOTE(review): `hash & (len(table)-1)` only acts as a modulo when
        // len(table) is a power of two, and the probe loop never exits if every
        // group is full; presumably the caller guarantees both - confirm.
        // No bounds checks are performed on keys or values in this routine.
    23  TEXT ·multiProbe32AVX2(SB), NOSPLIT, $0-112
    24      MOVQ table_base+0(FP), AX       // AX  = address of the first group
    25      MOVQ table_len+8(FP), BX        // BX  = len(table)
    26      MOVQ numKeys+24(FP), CX         // CX  = numKeys, the value given to the next inserted key
    27      MOVQ hashes_base+32(FP), DX     // DX  = address of hashes[0]
    28      MOVQ hashes_len+40(FP), DI      // DI  = len(hashes), the iteration count
    29      MOVQ keys_array_ptr+56(FP), R8  // R8  = address of the current key
    30      MOVQ keys_array_off+72(FP), R15 // R15 = byte offset added to R8 to reach the next key
    31      MOVQ values_base+80(FP), R9     // R9  = address of values[0]
    32      DECQ BX // modulo = len(table) - 1
    33  
    34      XORQ SI, SI // SI = i = 0
    35      JMP test    // enter at the loop condition so an empty hashes slice does no work
    36  loop:
    37      MOVQ (DX)(SI*8), R10  // R10 = hashes[i]
    38      VPBROADCASTD (R8), Y0 // Y0 = current key replicated in all 8 dword lanes
    39  probe:
    40      MOVQ R10, R11
    41      ANDQ BX, R11 // hash & modulo
    42      SHLQ $6, R11 // x 64 (size of table32Group)
    43      LEAQ (AX)(R11*1), R12 // R12 = address of the selected group
    44  
    45      VMOVDQU (R12), Y1    // Y1 = first 32 bytes of the group (keys start at byte 0)
    46      VPCMPEQD Y0, Y1, Y2  // per-lane equality of the key against the loaded dwords
    47      VMOVMSKPS Y2, R11    // R11 = one bit per lane, set where the lane matched
    48      MOVL 56(R12), R13    // R13 = mask of in-use slots
    49      TESTL R11, R13       // keep only matches that fall on in-use slots
    50      JZ insert            // no in-use slot holds the key
    51  
            // NOTE(review): the index is taken from the raw compare mask, not from
            // (R11 & R13). This relies on in-use slots being the lowest-numbered
            // ones, which is how the insert path below fills them.
    52      TZCNTL R11, R13          // R13 = index of the lowest matching lane
    53      MOVL 28(R12)(R13*4), R14 // R14 = group.values[R13]
    54  next:
    55      MOVL R14, (R9)(SI*4) // values[i] = R14
    56      INCQ SI              // i++
    57      ADDQ R15, R8         // advance to the next key
    58  test:
    59      CMPQ SI, DI
    60      JNE loop
    61      MOVQ CX, ret+104(FP) // return the updated numKeys
    62      VZEROUPPER           // clear the upper YMM halves before returning to non-AVX code
    63      RET
    64  insert:
            // R13 still holds the in-use mask loaded in the probe step (TESTL does
            // not modify its operands).
    65      CMPL R13, $0b1111111 // all 7 slots in use?
    66      JE probeNextGroup
    67  
    68      MOVL R13, R11
    69      POPCNTL R13, R13 // R13 = number of in-use slots = index of the first free slot
    70      MOVQ X0, R14 // key (low 64 bits of Y0; only the low 32 bits are stored below)
    71      SHLL $1, R11
    72      ORL $1, R11
    73      MOVL R11, 56(R12)       // group mask = (group mask << 1) | 1, marking one more slot in use
    74      MOVL R14, (R12)(R13*4)  // group.keys[i] = key
    75      MOVL CX, 28(R12)(R13*4) // group.values[i] = value
    76      MOVL CX, R14            // R14 = value to report through values[i]
    77      INCL CX                 // numKeys++
    78      JMP next
    79  probeNextGroup:
    80      INCQ R10  // group is full: retry with hash+1, which selects the following group
    81      JMP probe
    82  
    83  // func multiProbe64AVX2(table []table64Group, numKeys int, hashes []uintptr, keys sparse.Uint64Array, values []int32) int
        //
        // 64-bit-key counterpart of the 32-bit probe. For each i in
        // [0, len(hashes)), probes the group selected by hashes[i] for the i-th
        // key. On a hit, the stored value is written to values[i]. On a miss, the
        // key is inserted with the current numKeys as its value, that value is
        // written to values[i], and numKeys is incremented. A group with all 4
        // slots in use is skipped by retrying with hash+1. Returns the updated
        // numKeys.
        //
        // Group layout as addressed by this code (64 bytes per group):
        //   byte  0: keys, 8 bytes each
        //   byte 32: values, 4 bytes each
        //   byte 48: 32-bit mask of in-use slots
        //
        // NOTE(review): `hash & (len(table)-1)` only acts as a modulo when
        // len(table) is a power of two, and the probe loop never exits if every
        // group is full; presumably the caller guarantees both - confirm.
        // No bounds checks are performed on keys or values in this routine.
    84  TEXT ·multiProbe64AVX2(SB), NOSPLIT, $0-112
    85      MOVQ table_base+0(FP), AX       // AX  = address of the first group
    86      MOVQ table_len+8(FP), BX        // BX  = len(table)
    87      MOVQ numKeys+24(FP), CX         // CX  = numKeys, the value given to the next inserted key
    88      MOVQ hashes_base+32(FP), DX     // DX  = address of hashes[0]
    89      MOVQ hashes_len+40(FP), DI      // DI  = len(hashes), the iteration count
    90      MOVQ keys_array_ptr+56(FP), R8  // R8  = address of the current key
    91      MOVQ keys_array_off+72(FP), R15 // R15 = byte offset added to R8 to reach the next key
    92      MOVQ values_base+80(FP), R9     // R9  = address of values[0]
    93      DECQ BX // modulo = len(table) - 1
    94  
    95      XORQ SI, SI // SI = i = 0
    96      JMP test    // enter at the loop condition so an empty hashes slice does no work
    97  loop:
    98      MOVQ (DX)(SI*8), R10        // R10 = hashes[i]
    99      VPBROADCASTQ (R8), Y0 // Y0 = current key replicated in all 4 qword lanes
   100  probe:
   101      MOVQ R10, R11
   102      ANDQ BX, R11 // hash & modulo
   103      SHLQ $6, R11 // x 64 (size of table64Group)
   104      LEAQ (AX)(R11*1), R12 // R12 = address of the selected group
   105  
   106      VMOVDQU (R12), Y1    // Y1 = first 32 bytes of the group (keys start at byte 0)
   107      VPCMPEQQ Y0, Y1, Y2  // per-lane equality of the key against the loaded qwords
   108      VMOVMSKPD Y2, R11    // R11 = one bit per lane, set where the lane matched
   109      MOVL 48(R12), R13    // R13 = mask of in-use slots
   110      TESTL R11, R13       // keep only matches that fall on in-use slots
   111      JZ insert            // no in-use slot holds the key
   112  
            // NOTE(review): the index is taken from the raw compare mask, not from
            // (R11 & R13). This relies on in-use slots being the lowest-numbered
            // ones, which is how the insert path below fills them.
   113      TZCNTL R11, R13          // R13 = index of the lowest matching lane
   114      MOVL 32(R12)(R13*4), R14 // R14 = group.values[R13]
   115  next:
   116      MOVL R14, (R9)(SI*4) // values[i] = R14
   117      INCQ SI              // i++
   118      ADDQ R15, R8         // advance to the next key
   119  test:
   120      CMPQ SI, DI
   121      JNE loop
   122      MOVQ CX, ret+104(FP) // return the updated numKeys
   123      VZEROUPPER           // clear the upper YMM halves before returning to non-AVX code
   124      RET
   125  insert:
            // R13 still holds the in-use mask loaded in the probe step (TESTL does
            // not modify its operands).
   126      CMPL R13, $0b1111 // all 4 slots in use?
   127      JE probeNextGroup
   128  
   129      MOVL R13, R11
   130      POPCNTL R13, R13 // R13 = number of in-use slots = index of the first free slot
   131      SHLL $1, R11
   132      ORL $1, R11
   133      MOVL R11, 48(R12)       // group mask = (group mask << 1) | 1, marking one more slot in use
   134      MOVQ X0, (R12)(R13*8)   // group.keys[i] = key
   135      MOVL CX, 32(R12)(R13*4) // group.values[i] = value
   136      MOVL CX, R14            // R14 = value to report through values[i]
   137      INCL CX                 // numKeys++
   138      JMP next
   139  probeNextGroup:
   140      INCQ R10  // group is full: retry with hash+1, which selects the following group
   141      JMP probe
   142  
   143  // func multiProbe128SSE2(table []byte, tableCap, tableLen int, hashes []uintptr, keys sparse.Uint128Array, values []int32) int
        //
        // For each i in [0, len(hashes)), looks up the i-th 16-byte key starting at
        // slot (hashes[i] & (tableCap-1)) and moving to the slot selected by hash+1
        // on each mismatch. On a hit, the slot's stored entry minus one is written
        // to values[i]. On reaching an empty slot, the key is stored there,
        // tableLen is incremented and recorded as the slot's entry, and the
        // previous tableLen is written to values[i]. Returns the updated tableLen.
        //
        // Table layout as addressed by this code:
        //   byte 0:           one 16-byte key per slot
        //   byte tableCap*16: one 32-bit entry per slot; 0 marks an empty slot,
        //                     any other entry is the key's value plus one
        //
        // NOTE(review): `hash & (tableCap-1)` only acts as a modulo when tableCap
        // is a power of two, and the probe loop never exits if the key is absent
        // and no slot is empty; presumably the caller guarantees both - confirm.
        // No bounds checks are performed on table, keys or values in this routine.
   144  TEXT ·multiProbe128SSE2(SB), NOSPLIT, $0-120
   145      MOVQ table_base+0(FP), AX       // AX  = address of the first key slot
   146      MOVQ tableCap+24(FP), BX        // BX  = tableCap
   147      MOVQ tableLen+32(FP), CX        // CX  = tableLen, number of keys stored so far
   148      MOVQ hashes_base+40(FP), DX     // DX  = address of hashes[0]
   149      MOVQ hashes_len+48(FP), DI      // DI  = len(hashes), the iteration count
   150      MOVQ keys_array_ptr+64(FP), R8  // R8  = address of the current key
   151      MOVQ keys_array_off+80(FP), R15 // R15 = byte offset added to R8 to reach the next key
   152      MOVQ values_base+88(FP), R9     // R9  = address of values[0]
   153  
   154      MOVQ BX, R10
   155      SHLQ $4, R10           // tableCap * 16 = size in bytes of the key region
   156      LEAQ (AX)(R10*1), R10  // R10 = address of the 32-bit entries that follow the keys
   157      DECQ BX // modulo = tableCap - 1
   158  
   159      XORQ SI, SI // SI = i = 0
   160      JMP test    // enter at the loop condition so an empty hashes slice does no work
   161  loop:
   162      MOVQ (DX)(SI*8), R11 // hash
   163      MOVOU (R8), X0       // key
   164  probe:
   165      MOVQ R11, R12
   166      ANDQ BX, R12 // R12 = slot index = hash & modulo
   167  
   168      MOVL (R10)(R12*4), R14 // R14 = entry of the slot
   169      CMPL R14, $0
   170      JE insert              // entry 0: the slot is empty, the key is not in the table
   171  
   172      SHLQ $4, R12           // slot index -> byte offset of the 16-byte key
   173      MOVOU (AX)(R12*1), X1  // X1 = key stored in the slot
   174      PCMPEQL X0, X1         // compare the four 32-bit lanes of both keys
   175      MOVMSKPS X1, R13       // R13 = one bit per lane, set where the lane matched
   176      CMPL R13, $0b1111      // all four lanes equal means the full 128-bit key matches
   177      JE next
   178  
   179      INCQ R11   // different key in this slot: retry with hash+1
   180      JMP probe
   181  next:
   182      DECL R14             // entries are stored as value+1; undo the offset
   183      MOVL R14, (R9)(SI*4) // values[i] = R14
   184      INCQ SI              // i++
   185      ADDQ R15, R8         // advance to the next key
   186  test:
   187      CMPQ SI, DI
   188      JNE loop
   189      MOVQ CX, ret+112(FP) // return the updated tableLen
   190      RET
   191  insert:
   192      INCL CX                // tableLen++
   193      MOVL CX, (R10)(R12*4)  // slot entry = new tableLen (never 0, so the slot reads as used)
   194      MOVL CX, R14           // `next` subtracts one, so values[i] gets the previous tableLen
   195      SHLQ $4, R12           // slot index -> byte offset of the 16-byte key
   196      MOVOU X0, (AX)(R12*1)  // store the key in the slot
   197      JMP next