// Source: github.com/MerlinKodo/sing-tun@v0.1.15/internal/clashtcpip/tcpip_arm64.s

#include "textflag.h"

// func sumAsmNeon(data unsafe.Pointer, length uintptr) uintptr
//
// sumAsmNeon adds up the `length` bytes at `data`, interpreting them as
// consecutive big-endian 16-bit words, and returns the raw (unfolded) sum.
// When `length` is odd, the final byte is used as the high byte of one last
// word whose low byte is zero. No carry folding to 16 bits is done here.
//
// The bulk of the input is consumed with NEON in chunks of 32, 16 and 8
// bytes; whatever remains is handled by scalar code 2 bytes and then 1 byte
// at a time. The vector and scalar partial sums are combined at the end.
//
// NOTE(review): the four VSUM lanes are 32 bits wide and are accumulated with
// a plain vector ADD. Each lane can receive up to 4 * 0xFFFF per 32-byte
// iteration, so a lane could wrap on very large inputs (on the order of
// 512 KiB of 0xFF bytes). Confirm callers only pass packet-sized buffers.
//
// args (8 bytes aligned):
//   data   unsafe.Pointer - 8 bytes - 0 offset
//   length uintptr        - 8 bytes - 8 offset
//   result uintptr        - 8 bytes - 16 offset

// PDATA:  cursor into the input buffer.
// LENGTH: number of bytes still to be consumed.
// RESULT: scalar accumulator, also the final return value.
// VSUM:   vector accumulator, four 32-bit lanes.
#define PDATA  R0
#define LENGTH R1
#define RESULT R2
#define VSUM   V0
TEXT ·sumAsmNeon(SB),NOSPLIT,$0-24
	MOVD data+0(FP), PDATA
	MOVD length+8(FP), LENGTH
	MOVD $0, RESULT
	VMOVQ $0, $0, VSUM // clear all 128 bits of the vector accumulator

	// 32 bytes per iteration: four 8-byte registers.
#define LOADED_0 V1
#define LOADED_1 V2
#define LOADED_2 V3
#define LOADED_3 V4
BATCH_32:
	CMP $32, LENGTH
	BLO BATCH_16 // fewer than 32 bytes left
	VLD1 (PDATA), [LOADED_0.B8, LOADED_1.B8, LOADED_2.B8, LOADED_3.B8]
	// Swap the two bytes of every 16-bit element (big-endian -> native).
	VREV16 LOADED_0.B8, LOADED_0.B8
	VREV16 LOADED_1.B8, LOADED_1.B8
	VREV16 LOADED_2.B8, LOADED_2.B8
	VREV16 LOADED_3.B8, LOADED_3.B8
	// Shift-left-long by 0: zero-extend four 16-bit elements to 32 bits.
	VUSHLL $0, LOADED_0.H4, LOADED_0.S4
	VUSHLL $0, LOADED_1.H4, LOADED_1.S4
	VUSHLL $0, LOADED_2.H4, LOADED_2.S4
	VUSHLL $0, LOADED_3.H4, LOADED_3.S4
	VADD LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD LOADED_1.S4, VSUM.S4, VSUM.S4
	VADD LOADED_2.S4, VSUM.S4, VSUM.S4
	VADD LOADED_3.S4, VSUM.S4, VSUM.S4
	ADD $-32, LENGTH
	ADD $32, PDATA
	B BATCH_32
#undef LOADED_0
#undef LOADED_1
#undef LOADED_2
#undef LOADED_3

	// 16 bytes per iteration: two 8-byte registers.
#define LOADED_0 V1
#define LOADED_1 V2
BATCH_16:
	CMP $16, LENGTH
	BLO BATCH_8 // fewer than 16 bytes left
	VLD1 (PDATA), [LOADED_0.B8, LOADED_1.B8]
	VREV16 LOADED_0.B8, LOADED_0.B8
	VREV16 LOADED_1.B8, LOADED_1.B8
	VUSHLL $0, LOADED_0.H4, LOADED_0.S4
	VUSHLL $0, LOADED_1.H4, LOADED_1.S4
	VADD LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD LOADED_1.S4, VSUM.S4, VSUM.S4
	ADD $-16, LENGTH
	ADD $16, PDATA
	B BATCH_16
#undef LOADED_0
#undef LOADED_1

	// 8 bytes per iteration: one 8-byte register.
#define LOADED_0 V1
BATCH_8:
	CMP $8, LENGTH
	BLO BATCH_2 // fewer than 8 bytes left
	VLD1 (PDATA), [LOADED_0.B8]
	VREV16 LOADED_0.B8, LOADED_0.B8
	VUSHLL $0, LOADED_0.H4, LOADED_0.S4
	VADD LOADED_0.S4, VSUM.S4, VSUM.S4
	ADD $-8, LENGTH
	ADD $8, PDATA
	B BATCH_8
#undef LOADED_0

	// Scalar tail: one big-endian 16-bit word per iteration.
#define LOADED_L R3
#define LOADED_H R4
BATCH_2:
	CMP $2, LENGTH
	BLO BATCH_1 // fewer than 2 bytes left
	MOVBU (PDATA), LOADED_H  // first byte is the high half
	MOVBU 1(PDATA), LOADED_L // second byte is the low half
	LSL $8, LOADED_H
	ORR LOADED_H, LOADED_L, LOADED_L // LOADED_L = (high << 8) | low
	ADD LOADED_L, RESULT, RESULT
	ADD $2, PDATA
	ADD $-2, LENGTH
	B BATCH_2
#undef LOADED_H
#undef LOADED_L

	// Scalar tail: a single leftover byte becomes the high byte of a word.
#define LOADED R3
BATCH_1:
	CMP $1, LENGTH
	BLO COLLECT // nothing left
	MOVBU (PDATA), LOADED
	LSL $8, LOADED
	ADD LOADED, RESULT, RESULT
#undef LOADED

	// Add each of the four vector lanes into the scalar accumulator.
#define EXTRACTED R3
COLLECT:
	VMOV VSUM.S[0], EXTRACTED
	ADD EXTRACTED, RESULT
	VMOV VSUM.S[1], EXTRACTED
	ADD EXTRACTED, RESULT
	VMOV VSUM.S[2], EXTRACTED
	ADD EXTRACTED, RESULT
	VMOV VSUM.S[3], EXTRACTED
	ADD EXTRACTED, RESULT
#undef EXTRACTED
#undef VSUM
#undef PDATA
#undef LENGTH

	// This label is not branched to from within this routine; COLLECT falls
	// through into it.
RETURN:
	MOVD RESULT, result+16(FP)
	RET
#undef RESULT