github.com/MerlinKodo/sing-tun@v0.1.15/internal/clashtcpip/tcpip_amd64.s (about)

     1  #include "textflag.h"
     2  
     3  DATA endian_swap_mask<>+0(SB)/8, $0x607040502030001
     4  DATA endian_swap_mask<>+8(SB)/8, $0xE0F0C0D0A0B0809
     5  DATA endian_swap_mask<>+16(SB)/8, $0x607040502030001
     6  DATA endian_swap_mask<>+24(SB)/8, $0xE0F0C0D0A0B0809
     7  GLOBL endian_swap_mask<>(SB), RODATA, $32
     8  
     9  // func sumAsmAvx2(data unsafe.Pointer, length uintptr) uintptr
    10  //
    11  // args (8 bytes aligned):
    12  //   data   unsafe.Pointer - 8 bytes - 0 offset
    13  //   length uintptr        - 8 bytes - 8 offset
    14  //   result uintptr        - 8 bytes - 16 offset
    15  #define PDATA   AX
    16  #define LENGTH  CX
    17  #define RESULT  BX
    18  TEXT ·sumAsmAvx2(SB),NOSPLIT,$0-24
    19      MOVQ data+0(FP), PDATA
    20      MOVQ length+8(FP), LENGTH
    21      XORQ RESULT, RESULT
    22  
    23  #define VSUM             Y0
    24  #define ENDIAN_SWAP_MASK Y1
    25  BEGIN:
    26      VMOVDQU endian_swap_mask<>(SB), ENDIAN_SWAP_MASK
    27      VPXOR VSUM, VSUM, VSUM
    28  
    29  #define LOADED_0 Y2
    30  #define LOADED_1 Y3
    31  #define LOADED_2 Y4
    32  #define LOADED_3 Y5
    33  BATCH_64:
    34      CMPQ LENGTH, $64
    35      JB BATCH_32
    36      VPMOVZXWD (PDATA), LOADED_0
    37      VPMOVZXWD 16(PDATA), LOADED_1
    38      VPMOVZXWD 32(PDATA), LOADED_2
    39      VPMOVZXWD 48(PDATA), LOADED_3
    40      VPSHUFB ENDIAN_SWAP_MASK, LOADED_0, LOADED_0
    41      VPSHUFB ENDIAN_SWAP_MASK, LOADED_1, LOADED_1
    42      VPSHUFB ENDIAN_SWAP_MASK, LOADED_2, LOADED_2
    43      VPSHUFB ENDIAN_SWAP_MASK, LOADED_3, LOADED_3
    44      VPADDD LOADED_0, VSUM, VSUM
    45      VPADDD LOADED_1, VSUM, VSUM
    46      VPADDD LOADED_2, VSUM, VSUM
    47      VPADDD LOADED_3, VSUM, VSUM
    48      ADDQ $-64, LENGTH
    49      ADDQ $64, PDATA
    50      JMP BATCH_64
    51  #undef LOADED_0
    52  #undef LOADED_1
    53  #undef LOADED_2
    54  #undef LOADED_3
    55  
    56  #define LOADED_0 Y2
    57  #define LOADED_1 Y3
    58  BATCH_32:
    59      CMPQ LENGTH, $32
    60      JB BATCH_16
    61      VPMOVZXWD (PDATA), LOADED_0
    62      VPMOVZXWD 16(PDATA), LOADED_1
    63      VPSHUFB ENDIAN_SWAP_MASK, LOADED_0, LOADED_0
    64      VPSHUFB ENDIAN_SWAP_MASK, LOADED_1, LOADED_1
    65      VPADDD LOADED_0, VSUM, VSUM
    66      VPADDD LOADED_1, VSUM, VSUM
    67      ADDQ $-32, LENGTH
    68      ADDQ $32, PDATA
    69      JMP BATCH_32
    70  #undef LOADED_0
    71  #undef LOADED_1
    72  
    73  #define LOADED Y2
    74  BATCH_16:
    75      CMPQ LENGTH, $16
    76      JB COLLECT
    77      VPMOVZXWD (PDATA), LOADED
    78      VPSHUFB ENDIAN_SWAP_MASK, LOADED, LOADED
    79      VPADDD LOADED, VSUM, VSUM
    80      ADDQ $-16, LENGTH
    81      ADDQ $16, PDATA
    82      JMP BATCH_16
    83  #undef LOADED
    84  
    85  #define EXTRACTED Y2
    86  #define EXTRACTED_128 X2
    87  #define TEMP_64 DX
    88  COLLECT:
    89      VEXTRACTI128 $0, VSUM, EXTRACTED_128
    90      VPEXTRD $0, EXTRACTED_128, TEMP_64
    91      ADDL TEMP_64, RESULT
    92      VPEXTRD $1, EXTRACTED_128, TEMP_64
    93      ADDL TEMP_64, RESULT
    94      VPEXTRD $2, EXTRACTED_128, TEMP_64
    95      ADDL TEMP_64, RESULT
    96      VPEXTRD $3, EXTRACTED_128, TEMP_64
    97      ADDL TEMP_64, RESULT
    98      VEXTRACTI128 $1, VSUM, EXTRACTED_128
    99      VPEXTRD $0, EXTRACTED_128, TEMP_64
   100      ADDL TEMP_64, RESULT
   101      VPEXTRD $1, EXTRACTED_128, TEMP_64
   102      ADDL TEMP_64, RESULT
   103      VPEXTRD $2, EXTRACTED_128, TEMP_64
   104      ADDL TEMP_64, RESULT
   105      VPEXTRD $3, EXTRACTED_128, TEMP_64
   106      ADDL TEMP_64, RESULT
   107  #undef EXTRACTED
   108  #undef EXTRACTED_128
   109  #undef TEMP_64
   110  
   111  #define TEMP DX
   112  #define TEMP2 SI
   113  BATCH_2:
   114      CMPQ LENGTH, $2
   115      JB BATCH_1
   116      XORQ TEMP, TEMP
   117      MOVW (PDATA), TEMP
   118      MOVQ TEMP, TEMP2
   119      SHRW $8, TEMP2
   120      SHLW $8, TEMP
   121      ORW TEMP2, TEMP
   122      ADDL TEMP, RESULT
   123      ADDQ $-2, LENGTH
   124      ADDQ $2, PDATA
   125      JMP BATCH_2
   126  #undef TEMP
   127  
   128  #define TEMP DX
   129  BATCH_1:
   130      CMPQ LENGTH, $0
   131      JZ RETURN
   132      XORQ TEMP, TEMP
   133      MOVB (PDATA), TEMP
   134      SHLW $8, TEMP
   135      ADDL TEMP, RESULT
   136  #undef TEMP
   137  
   138  RETURN:
   139      MOVQ RESULT, result+16(FP)
   140      RET