//go:build arm64 && go1.16 && !purego
// +build arm64,go1.16,!purego

// Taken from https://github.com/bwesterb/armed-keccak

#include "textflag.h"

// func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
//
// Runs Keccak-f[1600] rounds (theta, rho, pi, chi, iota) in place over the
// 400 bytes at state: 24 rounds, or only 12 rounds using rc[12..23] when
// turbo is set.
//
// Arguments (Go stack-based assembly ABI; 17 bytes of arguments, no frame):
//   state+0(FP)   pointer to 25 consecutive 16-byte vectors (400 bytes),
//                 loaded on entry and overwritten on exit.
//   rc+8(FP)      pointer to the 64-bit round constants; one constant is
//                 consumed per round.
//   turbo+16(FP)  when non-zero, skip the first 12 constants and run only
//                 12 rounds.
//
// Register roles:
//   R0       cursor for the post-incremented state loads and stores
//   R1       pointer to the next round constant
//   R2       saved copy of the state base pointer
//   R3       number of rounds still to run
//   R4       turbo flag
//   V0-V24   the state, one vector per lane index (index = x + 5*y). Each
//            vector carries two 64-bit elements; presumably these are the
//            same lane of two interleaved Keccak states, as the "x2" in the
//            name suggests (confirm against the caller).
//   V25-V29  theta column parities, then theta D values; V25/V26 are also
//            scratch for chi and V25 holds the round constant for iota
//   V30      theta D value for column 0
//   V31      copy of lane 1, kept while V1 is overwritten by rho/pi
//
// Operand order of the SHA3 instructions in Go assembler syntax:
//   VEOR3 Va, Vm, Vn, Vd     Vd = Vn ^ Vm ^ Va
//   VRAX1 Vm, Vn, Vd         Vd = Vn ^ rol64(Vm, 1)
//   VXAR  $r, Vm, Vn, Vd     Vd = ror64(Vn ^ Vm, r)
//   VBCAX Va, Vm, Vn, Vd     Vd = Vn ^ (Vm &^ Va)
//
// EOR3, RAX1, XAR and BCAX belong to the optional Armv8.2 SHA3 extension.
// Nothing in this routine checks for it, so callers are expected to do so.
TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17
    MOVD state+0(FP), R0
    MOVD rc+8(FP), R1
    MOVD R0, R2                     // remember the base for the final store
    MOVD $24, R3                    // full permutation: 24 rounds

    // Load all 25 lane vectors: 6 * 64 + 16 = 400 bytes.
    VLD1.P 64(R0), [V0.B16, V1.B16, V2.B16, V3.B16]
    VLD1.P 64(R0), [V4.B16, V5.B16, V6.B16, V7.B16]
    VLD1.P 64(R0), [V8.B16, V9.B16, V10.B16, V11.B16]
    VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
    VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
    VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
    VLD1.P (R0), [V24.B16]

    MOVBU turbo+16(FP), R4
    CBZ R4, loop

    // Turbo: 12 rounds only, starting at rc[12] (12 * 8 = 96 bytes in).
    SUB $12, R3, R3
    ADD $96, R1, R1

loop:
    // Execute theta but without xorring into the state yet.
    // First pass: parity of rows y = 0..2 of each column x.
    VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
    VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
    VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
    VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
    VEOR3 V14.B16, V9.B16, V4.B16, V29.B16

    // Second pass: fold in rows y = 3..4, so V25..V29 = C[0]..C[4].
    VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
    VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
    VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
    VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
    VEOR3 V24.B16, V19.B16, V29.B16, V29.B16

    // D[x] = C[x-1] ^ rol(C[x+1], 1). The parities are overwritten in an
    // order that keeps every C value alive until its last use.
    VRAX1 V26.D2, V29.D2, V30.D2    // V30 = D[0] = C[4] ^ rol(C[1], 1)
    VRAX1 V29.D2, V27.D2, V29.D2    // V29 = D[3] = C[2] ^ rol(C[4], 1)
    VRAX1 V27.D2, V25.D2, V27.D2    // V27 = D[1] = C[0] ^ rol(C[2], 1)
    VRAX1 V25.D2, V28.D2, V25.D2    // V25 = D[4] = C[3] ^ rol(C[0], 1)
    VRAX1 V28.D2, V26.D2, V28.D2    // V28 = D[2] = C[1] ^ rol(C[3], 1)

    // Xor parities from step theta into the state at the same time as
    // executing rho and pi.
    //
    // Lane 0 is neither rotated nor moved, so it only receives D[0].
    VEOR V30.B16, V0.B16, V0.B16
    // Lane 1 is the first destination of the chain below; keep a copy.
    VMOV V1.B16, V31.B16

    // Each VXAR xors the D value of the source lane's column, rotates and
    // writes the pi destination. The order is load-bearing: every
    // destination register was read by the instruction just before it.
    // (rol amount = 64 - immediate.)
    VXAR $20, V27.D2, V6.D2, V1.D2      // lane  6 ->  1, rol 44
    VXAR $44, V25.D2, V9.D2, V6.D2      // lane  9 ->  6, rol 20
    VXAR $3, V28.D2, V22.D2, V9.D2      // lane 22 ->  9, rol 61
    VXAR $25, V25.D2, V14.D2, V22.D2    // lane 14 -> 22, rol 39
    VXAR $46, V30.D2, V20.D2, V14.D2    // lane 20 -> 14, rol 18
    VXAR $2, V28.D2, V2.D2, V20.D2      // lane  2 -> 20, rol 62
    VXAR $21, V28.D2, V12.D2, V2.D2     // lane 12 ->  2, rol 43
    VXAR $39, V29.D2, V13.D2, V12.D2    // lane 13 -> 12, rol 25
    VXAR $56, V25.D2, V19.D2, V13.D2    // lane 19 -> 13, rol  8
    VXAR $8, V29.D2, V23.D2, V19.D2     // lane 23 -> 19, rol 56
    VXAR $23, V30.D2, V15.D2, V23.D2    // lane 15 -> 23, rol 41
    VXAR $37, V25.D2, V4.D2, V15.D2     // lane  4 -> 15, rol 27
    VXAR $50, V25.D2, V24.D2, V4.D2     // lane 24 ->  4, rol 14
    VXAR $62, V27.D2, V21.D2, V24.D2    // lane 21 -> 24, rol  2
    VXAR $9, V29.D2, V8.D2, V21.D2      // lane  8 -> 21, rol 55
    VXAR $19, V27.D2, V16.D2, V8.D2     // lane 16 ->  8, rol 45
    VXAR $28, V30.D2, V5.D2, V16.D2     // lane  5 -> 16, rol 36
    VXAR $36, V29.D2, V3.D2, V5.D2      // lane  3 ->  5, rol 28
    VXAR $43, V29.D2, V18.D2, V3.D2     // lane 18 ->  3, rol 21
    VXAR $49, V28.D2, V17.D2, V18.D2    // lane 17 -> 18, rol 15
    VXAR $54, V27.D2, V11.D2, V17.D2    // lane 11 -> 17, rol 10
    VXAR $58, V28.D2, V7.D2, V11.D2     // lane  7 -> 11, rol  6
    VXAR $61, V30.D2, V10.D2, V7.D2     // lane 10 ->  7, rol  3
    VXAR $63, V27.D2, V31.D2, V10.D2    // lane  1 -> 10, rol  1 (saved copy)

    // Chi: A[x] ^= ~A[x+1] & A[x+2] within each row of five lanes. The
    // first two results of a row go to V25/V26 because the last two
    // results still need the old values of the row's first two lanes.
    VBCAX V1.B16, V2.B16, V0.B16, V25.B16
    VBCAX V2.B16, V3.B16, V1.B16, V26.B16
    VBCAX V3.B16, V4.B16, V2.B16, V2.B16
    VBCAX V4.B16, V0.B16, V3.B16, V3.B16
    VBCAX V0.B16, V1.B16, V4.B16, V4.B16
    VMOV V25.B16, V0.B16
    VMOV V26.B16, V1.B16

    VBCAX V6.B16, V7.B16, V5.B16, V25.B16
    VBCAX V7.B16, V8.B16, V6.B16, V26.B16
    VBCAX V8.B16, V9.B16, V7.B16, V7.B16
    VBCAX V9.B16, V5.B16, V8.B16, V8.B16
    VBCAX V5.B16, V6.B16, V9.B16, V9.B16
    VMOV V25.B16, V5.B16
    VMOV V26.B16, V6.B16

    VBCAX V11.B16, V12.B16, V10.B16, V25.B16
    VBCAX V12.B16, V13.B16, V11.B16, V26.B16
    VBCAX V13.B16, V14.B16, V12.B16, V12.B16
    VBCAX V14.B16, V10.B16, V13.B16, V13.B16
    VBCAX V10.B16, V11.B16, V14.B16, V14.B16
    VMOV V25.B16, V10.B16
    VMOV V26.B16, V11.B16

    VBCAX V16.B16, V17.B16, V15.B16, V25.B16
    VBCAX V17.B16, V18.B16, V16.B16, V26.B16
    VBCAX V18.B16, V19.B16, V17.B16, V17.B16
    VBCAX V19.B16, V15.B16, V18.B16, V18.B16
    VBCAX V15.B16, V16.B16, V19.B16, V19.B16
    VMOV V25.B16, V15.B16
    VMOV V26.B16, V16.B16

    VBCAX V21.B16, V22.B16, V20.B16, V25.B16
    VBCAX V22.B16, V23.B16, V21.B16, V26.B16
    VBCAX V23.B16, V24.B16, V22.B16, V22.B16
    VBCAX V24.B16, V20.B16, V23.B16, V23.B16
    VBCAX V20.B16, V21.B16, V24.B16, V24.B16
    VMOV V25.B16, V20.B16
    VMOV V26.B16, V21.B16

    // Iota: load the next round constant into both 64-bit elements,
    // advance R1 by 8, and xor the constant into lane 0.
    VLD1R.P 8(R1), [V25.D2]
    VEOR V25.B16, V0.B16, V0.B16

    // CBNZ tests the register itself, so the decrement does not need to
    // set the condition flags.
    SUB $1, R3, R3
    CBNZ R3, loop

    // Write the permuted state back over the input.
    MOVD R2, R0

    VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
    VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
    VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
    VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
    VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
    VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
    VST1.P [V24.B16], (R0)

    RET