github.com/cloudflare/circl@v1.5.0/simd/keccakf1600/f1600x2_arm64.s (about)

     1  // +build arm64,go1.16,!purego
     2  
     3  // Taken from https://github.com/bwesterb/armed-keccak
     4  
     5  #include "textflag.h"
     6  
     7  // func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
     8  TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17  // Permutes the 400 bytes at state in place: 24 rounds, or only the last 12 round constants when turbo is set.
     9      MOVD state+0(FP), R0  // R0 = state pointer; advanced by the post-increment loads below
    10      MOVD rc+8(FP), R1  // R1 = pointer to the next round constant
    11      MOVD R0, R2  // R2 = untouched copy of state, used to rewind R0 before the store-back
    12      MOVD $24, R3  // R3 = rounds remaining
    13  // Load 25 x 16 bytes into V0..V24; each register holds two 64-bit values (presumably the same lane of two interleaved states - confirm against caller).
    14      VLD1.P 64(R0), [ V0.B16,  V1.B16,  V2.B16,  V3.B16]
    15      VLD1.P 64(R0), [ V4.B16,  V5.B16,  V6.B16,  V7.B16]
    16      VLD1.P 64(R0), [ V8.B16,  V9.B16, V10.B16, V11.B16]
    17      VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
    18      VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
    19      VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
    20      VLD1.P (R0),   [V24.B16]
    21  
    22      MOVBU turbo+16(FP), R4  // R4 = turbo byte, zero-extended
    23      CBZ R4, loop  // turbo == 0: keep all 24 rounds and start at rc[0]
    24  // turbo: run 12 rounds and skip the first 12 round constants (12 * 8 = 96 bytes).
    25      SUB  $12, R3, R3
    26      ADD  $96, R1, R1
    27  
    28  loop:
    29      // Execute theta but without xorring into the state yet.
    30      VEOR3 V10.B16, V5.B16, V0.B16, V25.B16  // V25 = V0 ^ V5 ^ V10 (likewise V26..V29 for the other four columns)
    31      VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
    32      VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
    33      VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
    34      VEOR3 V14.B16, V9.B16, V4.B16, V29.B16
    35  // Fold in the remaining two rows so V25..V29 hold the XOR of all five registers of each column.
    36      VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
    37      VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
    38      VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
    39      VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
    40      VEOR3 V24.B16, V19.B16, V29.B16, V29.B16
    41  
    42      // Xor parities from step theta into the state at the same time as
    43      // executing rho and pi.
    44      VRAX1 V26.D2, V29.D2, V30.D2  // V30 = V29 ^ rotl64(V26, 1): theta term applied to V0, V5, V10, V15, V20
    45      VRAX1 V29.D2, V27.D2, V29.D2  // V29 becomes the theta term applied to V3, V8, V13, V18, V23
    46      VRAX1 V27.D2, V25.D2, V27.D2  // V27 becomes the theta term applied to V1, V6, V11, V16, V21
    47      VRAX1 V25.D2, V28.D2, V25.D2  // V25 becomes the theta term applied to V4, V9, V14, V19, V24
    48      VRAX1 V28.D2, V26.D2, V28.D2  // V28 becomes the theta term applied to V2, V7, V12, V17, V22
    49  // V0 is neither rotated nor moved, so it only needs the theta XOR.
    50      VEOR V30.B16, V0.B16, V0.B16
    51      VMOV V1.B16, V31.B16  // keep the old V1: it is overwritten by the next line but read by the final VXAR
    52  // Each VXAR computes dst = rotr64(src ^ theta term, imm). Every line writes the register the previous line read as its source, so the order must not change.
    53      VXAR $20, V27.D2,  V6.D2,  V1.D2   
    54      VXAR $44, V25.D2,  V9.D2,  V6.D2   
    55      VXAR $3 , V28.D2, V22.D2,  V9.D2   
    56      VXAR $25, V25.D2, V14.D2, V22.D2  
    57      VXAR $46, V30.D2, V20.D2, V14.D2  
    58      VXAR $2 , V28.D2,  V2.D2, V20.D2  
    59      VXAR $21, V28.D2, V12.D2,  V2.D2  
    60      VXAR $39, V29.D2, V13.D2, V12.D2  
    61      VXAR $56, V25.D2, V19.D2, V13.D2  
    62      VXAR $8 , V29.D2, V23.D2, V19.D2  
    63      VXAR $23, V30.D2, V15.D2, V23.D2  
    64      VXAR $37, V25.D2,  V4.D2, V15.D2  
    65      VXAR $50, V25.D2, V24.D2,  V4.D2   
    66      VXAR $62, V27.D2, V21.D2, V24.D2  
    67      VXAR $9 , V29.D2,  V8.D2, V21.D2  
    68      VXAR $19, V27.D2, V16.D2,  V8.D2   
    69      VXAR $28, V30.D2,  V5.D2, V16.D2  
    70      VXAR $36, V29.D2,  V3.D2,  V5.D2   
    71      VXAR $43, V29.D2, V18.D2,  V3.D2   
    72      VXAR $49, V28.D2, V17.D2, V18.D2  
    73      VXAR $54, V27.D2, V11.D2, V17.D2  
    74      VXAR $58, V28.D2,  V7.D2, V11.D2  
    75      VXAR $61, V30.D2, V10.D2,  V7.D2   
    76      VXAR $63, V27.D2, V31.D2, V10.D2  // consumes the copy of the old V1 saved in V31
    77  
    78      // Chi: within each row of five registers, out[i] = in[i] ^ (~in[i+1] & in[i+2]), indices mod 5.
    79      VBCAX V1.B16, V2.B16, V0.B16, V25.B16  // result for V0 parked in V25: the old V0 is still an input below
    80      VBCAX V2.B16, V3.B16, V1.B16, V26.B16  // result for V1 parked in V26: the old V1 is still an input below
    81      VBCAX V3.B16, V4.B16, V2.B16,  V2.B16
    82      VBCAX V4.B16, V0.B16, V3.B16,  V3.B16
    83      VBCAX V0.B16, V1.B16, V4.B16,  V4.B16
    84      VMOV V25.B16, V0.B16  // commit the two parked results now that the row is done
    85      VMOV V26.B16, V1.B16
    86  // Rows V5..V9, V10..V14, V15..V19 and V20..V24 below follow the same pattern, reusing V25/V26 as temporaries.
    87      VBCAX V6.B16, V7.B16, V5.B16, V25.B16
    88      VBCAX V7.B16, V8.B16, V6.B16, V26.B16
    89      VBCAX V8.B16, V9.B16, V7.B16,  V7.B16
    90      VBCAX V9.B16, V5.B16, V8.B16,  V8.B16
    91      VBCAX V5.B16, V6.B16, V9.B16,  V9.B16
    92      VMOV V25.B16, V5.B16
    93      VMOV V26.B16, V6.B16
    94  
    95      VBCAX V11.B16, V12.B16, V10.B16, V25.B16
    96      VBCAX V12.B16, V13.B16, V11.B16, V26.B16
    97      VBCAX V13.B16, V14.B16, V12.B16, V12.B16
    98      VBCAX V14.B16, V10.B16, V13.B16, V13.B16
    99      VBCAX V10.B16, V11.B16, V14.B16, V14.B16
   100      VMOV V25.B16, V10.B16
   101      VMOV V26.B16, V11.B16
   102  
   103      VBCAX V16.B16, V17.B16, V15.B16, V25.B16
   104      VBCAX V17.B16, V18.B16, V16.B16, V26.B16
   105      VBCAX V18.B16, V19.B16, V17.B16, V17.B16
   106      VBCAX V19.B16, V15.B16, V18.B16, V18.B16
   107      VBCAX V15.B16, V16.B16, V19.B16, V19.B16
   108      VMOV V25.B16, V15.B16
   109      VMOV V26.B16, V16.B16
   110  
   111      VBCAX V21.B16, V22.B16, V20.B16, V25.B16
   112      VBCAX V22.B16, V23.B16, V21.B16, V26.B16
   113      VBCAX V23.B16, V24.B16, V22.B16, V22.B16
   114      VBCAX V24.B16, V20.B16, V23.B16, V23.B16
   115      VBCAX V20.B16, V21.B16, V24.B16, V24.B16
   116      VMOV V25.B16, V20.B16
   117      VMOV V26.B16, V21.B16
   118  
   119      // Iota: XOR the next round constant into both 64-bit halves of V0.
   120      VLD1R.P 8(R1), [V25.D2]  // load one 8-byte constant replicated into both halves; R1 advances by 8
   121      VEOR V25.B16, V0.B16, V0.B16
   122  
   123      SUBS $1, R3, R3  // one round done; the flags this sets are not read (CBNZ tests R3 directly)
   124      CBNZ R3, loop
   125  
   126      MOVD R2, R0  // rewind R0 to the start of state for the store-back
   127  // Write V0..V24 back over the 400 bytes they were loaded from.
   128      VST1.P [ V0.B16,  V1.B16,  V2.B16,  V3.B16], 64(R0) 
   129      VST1.P [ V4.B16,  V5.B16,  V6.B16,  V7.B16], 64(R0)
   130      VST1.P [ V8.B16,  V9.B16, V10.B16, V11.B16], 64(R0)
   131      VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
   132      VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
   133      VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
   134      VST1.P [V24.B16], (R0)
   135  
   136      RET