// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// QR is the ChaCha quarter-round on A, B, C, and D.
// V30 is used as a temporary, and V31 is assumed to
// hold the index table for rotate left 8.
//
// The four rotate-lefts of the quarter-round are implemented as:
//   rotl 16: VREV32 on 16-bit lanes (swaps the two halves of each word)
//   rotl 12: VSHL $12 / VSRI $20 pair through the V30 temporary
//   rotl  8: VTBL byte shuffle using the table preloaded in V31
//   rotl  7: VSHL $7 / VSRI $25 pair through the V30 temporary
#define QR(A, B, C, D) \
	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VREV32 D.H8, D.H8; \
	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $12, V30.S4, B.S4; VSRI $20, V30.S4, B.S4 \
	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VTBL V31.B16, [D.B16], D.B16; \
	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $7, V30.S4, B.S4; VSRI $25, V30.S4, B.S4

// block runs 4 ChaCha8 block transformations in the four stripes of the V registers.
//
// Register layout (each V register holds one matrix entry for all 4 blocks,
// one block per 32-bit lane):
//   V0-V3   row 0: the four ChaCha constants, identical across blocks
//   V4-V11  rows 1-2: the 8-word seed, identical across blocks
//   V12     counter word: counter+0, +1, +2, +3 across the four lanes
//   V13-V15 remaining row-3 words, fixed to zero
//   V20-V27 saved copy of V4-V11 for the final feed-forward add
//   V30     scratch, V31 rotate-left-8 byte-index table (see QR)

// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
	// seed in R0
	// blocks in R1
	// counter in R2

	// Load initial constants into top row.
	MOVD $·chachaConst(SB), R10
	VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]

	// Load increment and rotate 8 constants into V30, V31.
	MOVD $·chachaIncRot(SB), R11
	VLD1 (R11), [V30.S4, V31.S4]

	// Broadcast each seed word across the 4 lanes of its register
	// (VLD4R de-interleaves 4 words at a time, replicating each).
	VLD4R.P 16(R0), [V4.S4, V5.S4, V6.S4, V7.S4]
	VLD4R.P 16(R0), [V8.S4, V9.S4, V10.S4, V11.S4]

	// store counter to memory to replicate its uint32 halves back out:
	// spill the 32-bit counter to the stack so VLD1R can broadcast it
	// into all four lanes of V12
	MOVW R2, 0(RSP)
	VLD1R 0(RSP), [V12.S4]

	// Add 0, 1, 2, 3 to counter stripes.
	VADD V30.S4, V12.S4, V12.S4

	// Zeros for remaining two matrix entries.
	VEOR V13.B16, V13.B16, V13.B16
	VEOR V14.B16, V14.B16, V14.B16
	VEOR V15.B16, V15.B16, V15.B16

	// Save seed state for adding back later.
	VMOV V4.B16, V20.B16
	VMOV V5.B16, V21.B16
	VMOV V6.B16, V22.B16
	VMOV V7.B16, V23.B16
	VMOV V8.B16, V24.B16
	VMOV V9.B16, V25.B16
	VMOV V10.B16, V26.B16
	VMOV V11.B16, V27.B16

	// 4 iterations. Each iteration is 8 quarter-rounds
	// (one column round then one diagonal round = 2 ChaCha rounds,
	// so 4 iterations = 8 rounds total).
	MOVD $4, R0
loop:
	// Column round: quarter-rounds down the columns.
	QR(V0, V4, V8, V12)
	QR(V1, V5, V9, V13)
	QR(V2, V6, V10, V14)
	QR(V3, V7, V11, V15)

	// Diagonal round: quarter-rounds along the diagonals.
	QR(V0, V5, V10, V15)
	QR(V1, V6, V11, V12)
	QR(V2, V7, V8, V13)
	QR(V3, V4, V9, V14)

	SUB $1, R0
	CBNZ R0, loop

	// Add seed back.
	// (Note: only the seed rows are fed forward here, not the
	// constant or counter rows — this is the chacha8rand variant,
	// intentionally different from standard ChaCha output.)
	VADD V4.S4, V20.S4, V4.S4
	VADD V5.S4, V21.S4, V5.S4
	VADD V6.S4, V22.S4, V6.S4
	VADD V7.S4, V23.S4, V7.S4
	VADD V8.S4, V24.S4, V8.S4
	VADD V9.S4, V25.S4, V9.S4
	VADD V10.S4, V26.S4, V10.S4
	VADD V11.S4, V27.S4, V11.S4

	// Store interlaced blocks back to output buffer.
	VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R1)
	VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R1)
	VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R1)
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R1)
	RET

// The four ChaCha "expand 32-byte k" constant words.
GLOBL ·chachaConst(SB), NOPTR|RODATA, $32
DATA ·chachaConst+0x00(SB)/4, $0x61707865
DATA ·chachaConst+0x04(SB)/4, $0x3320646e
DATA ·chachaConst+0x08(SB)/4, $0x79622d32
DATA ·chachaConst+0x0c(SB)/4, $0x6b206574

// First 16 bytes: per-lane counter increments {0,1,2,3}.
// Second 16 bytes: VTBL byte-index table implementing rotate left 8
// on each 32-bit word (used by QR via V31).
GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32
DATA ·chachaIncRot+0x00(SB)/4, $0x00000000
DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
DATA ·chachaIncRot+0x10(SB)/4, $0x02010003
DATA ·chachaIncRot+0x14(SB)/4, $0x06050407
DATA ·chachaIncRot+0x18(SB)/4, $0x0A09080B
DATA ·chachaIncRot+0x1c(SB)/4, $0x0E0D0C0F