github.com/MerlinKodo/sing-tun@v0.1.15/internal/clashtcpip/tcpip_arm64.s (about)

     1  #include "textflag.h"
     2  
     3  // func sumAsmNeon(data unsafe.Pointer, length uintptr) uintptr
     4  //
     5  // args (8 bytes aligned):
     6  //   data   unsafe.Pointer - 8 bytes - 0 offset
     7  //   length uintptr        - 8 bytes - 8 offset
     8  //   result uintptr        - 8 bytes - 16 offset
     9  #define PDATA  R0
    10  #define LENGTH R1
    11  #define RESULT R2
    12  #define VSUM V0
    13  TEXT ·sumAsmNeon(SB),NOSPLIT,$0-24
    14      MOVD data+0(FP), PDATA
    15      MOVD length+8(FP), LENGTH
    16      MOVD $0, RESULT
    17      VMOVQ $0, $0, VSUM
    18  
    19  #define LOADED_0 V1
    20  #define LOADED_1 V2
    21  #define LOADED_2 V3
    22  #define LOADED_3 V4
    23  BATCH_32:
    24      CMP $32, LENGTH
    25      BLO BATCH_16
    26      VLD1 (PDATA), [LOADED_0.B8, LOADED_1.B8, LOADED_2.B8, LOADED_3.B8]
    27      VREV16 LOADED_0.B8, LOADED_0.B8
    28      VREV16 LOADED_1.B8, LOADED_1.B8
    29      VREV16 LOADED_2.B8, LOADED_2.B8
    30      VREV16 LOADED_3.B8, LOADED_3.B8
    31      VUSHLL $0, LOADED_0.H4, LOADED_0.S4
    32      VUSHLL $0, LOADED_1.H4, LOADED_1.S4
    33      VUSHLL $0, LOADED_2.H4, LOADED_2.S4
    34      VUSHLL $0, LOADED_3.H4, LOADED_3.S4
    35      VADD LOADED_0.S4, VSUM.S4, VSUM.S4
    36      VADD LOADED_1.S4, VSUM.S4, VSUM.S4
    37      VADD LOADED_2.S4, VSUM.S4, VSUM.S4
    38      VADD LOADED_3.S4, VSUM.S4, VSUM.S4
    39      ADD $-32, LENGTH
    40      ADD $32, PDATA
    41      B BATCH_32
    42  #undef LOADED_0
    43  #undef LOADED_1
    44  #undef LOADED_2
    45  #undef LOADED_3
    46  
    47  #define LOADED_0 V1
    48  #define LOADED_1 V2
    49  BATCH_16:
    50      CMP $16, LENGTH
    51      BLO BATCH_8
    52      VLD1 (PDATA), [LOADED_0.B8, LOADED_1.B8]
    53      VREV16 LOADED_0.B8, LOADED_0.B8
    54      VREV16 LOADED_1.B8, LOADED_1.B8
    55      VUSHLL $0, LOADED_0.H4, LOADED_0.S4
    56      VUSHLL $0, LOADED_1.H4, LOADED_1.S4
    57      VADD LOADED_0.S4, VSUM.S4, VSUM.S4
    58      VADD LOADED_1.S4, VSUM.S4, VSUM.S4
    59      ADD $-16, LENGTH
    60      ADD $16, PDATA
    61      B BATCH_16
    62  #undef LOADED_0
    63  #undef LOADED_1
    64  
    65  #define LOADED_0 V1
    66  BATCH_8:
    67      CMP $8, LENGTH
    68      BLO BATCH_2
    69      VLD1 (PDATA), [LOADED_0.B8]
    70      VREV16 LOADED_0.B8, LOADED_0.B8
    71      VUSHLL $0, LOADED_0.H4, LOADED_0.S4
    72      VADD LOADED_0.S4, VSUM.S4, VSUM.S4
    73      ADD $-8, LENGTH
    74      ADD $8, PDATA
    75      B BATCH_8
    76  #undef LOADED_0
    77  
    78  #define LOADED_L R3
    79  #define LOADED_H R4
    80  BATCH_2:
    81      CMP $2, LENGTH
    82      BLO BATCH_1
    83      MOVBU (PDATA), LOADED_H
    84      MOVBU 1(PDATA), LOADED_L
    85      LSL $8, LOADED_H
    86      ORR LOADED_H, LOADED_L, LOADED_L
    87      ADD LOADED_L, RESULT, RESULT
    88      ADD $2, PDATA
    89      ADD $-2, LENGTH
    90      B BATCH_2
    91  #undef LOADED_H
    92  #undef LOADED_L
    93  
    94  #define LOADED R3
    95  BATCH_1:
    96      CMP $1, LENGTH
    97      BLO COLLECT
    98      MOVBU (PDATA), LOADED
    99      LSL $8, LOADED
   100      ADD LOADED, RESULT, RESULT
   101  
   102  #define EXTRACTED R3
   103  COLLECT:
   104      VMOV VSUM.S[0], EXTRACTED
   105      ADD EXTRACTED, RESULT
   106      VMOV VSUM.S[1], EXTRACTED
   107      ADD EXTRACTED, RESULT
   108      VMOV VSUM.S[2], EXTRACTED
   109      ADD EXTRACTED, RESULT
   110      VMOV VSUM.S[3], EXTRACTED
   111      ADD EXTRACTED, RESULT
   112  #undef VSUM
   113  #undef PDATA
   114  #undef LENGTH
   115  
   116  RETURN:
   117      MOVD RESULT, result+16(FP)
   118      RET