// Recovered from: github.com/MerlinKodo/sing-tun@v0.1.15/internal/clashtcpip/tcpip_amd64.s
//
// Syntax: Go assembler (Plan 9 style, operands are "src, dst").
// Target: amd64 with AVX2. Arguments and result are passed on the stack
// through the FP pseudo-register.

#include "textflag.h"

// VPSHUFB control mask that swaps the two bytes of every 16-bit word
// (byte order 1,0,3,2,5,4,...). The 16-byte pattern is stored twice so it
// covers both 128-bit lanes of a YMM register.
DATA endian_swap_mask<>+0(SB)/8, $0x0607040502030001
DATA endian_swap_mask<>+8(SB)/8, $0x0E0F0C0D0A0B0809
DATA endian_swap_mask<>+16(SB)/8, $0x0607040502030001
DATA endian_swap_mask<>+24(SB)/8, $0x0E0F0C0D0A0B0809
GLOBL endian_swap_mask<>(SB), RODATA, $32

// func sumAsmAvx2(data unsafe.Pointer, length uintptr) uintptr
//
// Adds up the `length` bytes starting at `data`, read as consecutive
// big-endian 16-bit words. An odd trailing byte is counted as the high byte
// of one more word. The raw sum is returned; no carry folding or
// complementing is done here.
// NOTE(review): presumably the partial sum for an Internet-style checksum,
// finished by the Go caller - confirm against the caller.
//
// All additions are 32-bit (VPADDD lanes, ADDL for the final fold), so the
// returned value is the sum modulo 2^32, zero-extended to 64 bits.
//
// args (8 bytes aligned):
//   data   unsafe.Pointer - 8 bytes -  0 offset
//   length uintptr        - 8 bytes -  8 offset
//   result uintptr        - 8 bytes - 16 offset
//
// Register roles:
//   PDATA  (AX)     - read cursor
//   LENGTH (CX)     - bytes remaining
//   RESULT (BX)     - scalar accumulator
//   TEMP   (DX)     - scratch for lane extraction and the scalar tail
//   VSUM   (Y0)     - eight 32-bit partial sums
//   SWAP_MASK (Y1)  - byte-swap shuffle control
//   LOADED_0..3 (Y2-Y5), LANES_128 (X2) - vector scratch
#define PDATA AX
#define LENGTH CX
#define RESULT BX
#define TEMP DX
#define VSUM Y0
#define SWAP_MASK Y1
#define LOADED_0 Y2
#define LOADED_1 Y3
#define LOADED_2 Y4
#define LOADED_3 Y5
#define LANES_128 X2
TEXT ·sumAsmAvx2(SB),NOSPLIT,$0-24
	MOVQ data+0(FP), PDATA
	MOVQ length+8(FP), LENGTH
	XORL RESULT, RESULT

	VMOVDQU endian_swap_mask<>(SB), SWAP_MASK
	VPXOR VSUM, VSUM, VSUM

	// 64 bytes per iteration. Each VPMOVZXWD widens eight 16-bit words to
	// eight 32-bit lanes; VPSHUFB then swaps the bytes of the low word of
	// every lane, turning the little-endian load into the big-endian value.
loop64:
	CMPQ LENGTH, $64
	JB loop32
	VPMOVZXWD (PDATA), LOADED_0
	VPMOVZXWD 16(PDATA), LOADED_1
	VPMOVZXWD 32(PDATA), LOADED_2
	VPMOVZXWD 48(PDATA), LOADED_3
	VPSHUFB SWAP_MASK, LOADED_0, LOADED_0
	VPSHUFB SWAP_MASK, LOADED_1, LOADED_1
	VPSHUFB SWAP_MASK, LOADED_2, LOADED_2
	VPSHUFB SWAP_MASK, LOADED_3, LOADED_3
	VPADDD LOADED_0, VSUM, VSUM
	VPADDD LOADED_1, VSUM, VSUM
	VPADDD LOADED_2, VSUM, VSUM
	VPADDD LOADED_3, VSUM, VSUM
	SUBQ $64, LENGTH
	ADDQ $64, PDATA
	JMP loop64

	// 32 bytes per iteration (LENGTH < 64 here).
loop32:
	CMPQ LENGTH, $32
	JB loop16
	VPMOVZXWD (PDATA), LOADED_0
	VPMOVZXWD 16(PDATA), LOADED_1
	VPSHUFB SWAP_MASK, LOADED_0, LOADED_0
	VPSHUFB SWAP_MASK, LOADED_1, LOADED_1
	VPADDD LOADED_0, VSUM, VSUM
	VPADDD LOADED_1, VSUM, VSUM
	SUBQ $32, LENGTH
	ADDQ $32, PDATA
	JMP loop32

	// 16 bytes per iteration (LENGTH < 32 here).
loop16:
	CMPQ LENGTH, $16
	JB collect
	VPMOVZXWD (PDATA), LOADED_0
	VPSHUFB SWAP_MASK, LOADED_0, LOADED_0
	VPADDD LOADED_0, VSUM, VSUM
	SUBQ $16, LENGTH
	ADDQ $16, PDATA
	JMP loop16

	// Horizontal reduction: add the eight 32-bit lanes of VSUM into RESULT.
	// Runs for every call, including inputs shorter than 16 bytes (VSUM is
	// then all zero).
collect:
	VEXTRACTI128 $0, VSUM, LANES_128
	VPEXTRD $0, LANES_128, TEMP
	ADDL TEMP, RESULT
	VPEXTRD $1, LANES_128, TEMP
	ADDL TEMP, RESULT
	VPEXTRD $2, LANES_128, TEMP
	ADDL TEMP, RESULT
	VPEXTRD $3, LANES_128, TEMP
	ADDL TEMP, RESULT
	VEXTRACTI128 $1, VSUM, LANES_128
	VPEXTRD $0, LANES_128, TEMP
	ADDL TEMP, RESULT
	VPEXTRD $1, LANES_128, TEMP
	ADDL TEMP, RESULT
	VPEXTRD $2, LANES_128, TEMP
	ADDL TEMP, RESULT
	VPEXTRD $3, LANES_128, TEMP
	ADDL TEMP, RESULT

	// Last vector instruction is behind us: clear the upper YMM halves so
	// that non-VEX SSE code executed after we return does not pay the
	// AVX/SSE state-transition penalty.
	VZEROUPPER

	// Scalar tail, one 16-bit word at a time (LENGTH < 16 here).
tail2:
	CMPQ LENGTH, $2
	JB tail1
	MOVWLZX (PDATA), TEMP
	// Swap the two bytes of the word: little-endian load -> big-endian value.
	ROLW $8, TEMP
	ADDL TEMP, RESULT
	SUBQ $2, LENGTH
	ADDQ $2, PDATA
	JMP tail2

	// At most one byte left; it is the high byte of a zero-padded word.
tail1:
	TESTQ LENGTH, LENGTH
	JZ done
	MOVBLZX (PDATA), TEMP
	SHLL $8, TEMP
	ADDL TEMP, RESULT

done:
	MOVQ RESULT, result+16(FP)
	RET
#undef PDATA
#undef LENGTH
#undef RESULT
#undef TEMP
#undef VSUM
#undef SWAP_MASK
#undef LOADED_0
#undef LOADED_1
#undef LOADED_2
#undef LOADED_3
#undef LANES_128