// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// ChaCha8 is ChaCha with 8 rounds.
// See https://cr.yp.to/chacha/chacha-20080128.pdf.
// See chacha8_generic.go for additional details.
//
// Every macro below treats an XMM register as four independent uint32
// lanes ("stripes"), so one instruction advances four ChaCha8 blocks.

// ROL rotates the uint32s in register R left by N bits, using temporary T.
// T receives R<<N, R is shifted right by 32-N in place, and the two
// non-overlapping halves are merged with PXOR. T is clobbered.
#define ROL(N, R, T) \
	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R

// ROL16 and ROL8 rotate the uint32s in register R left by 16 and 8 bits.
// When GOAMD64_v2 is defined each is a single PSHUFB through the byte
// permutation table ·rol16<> or ·rol8<> (defined at the end of this file)
// and T is left untouched; otherwise they fall back to ROL and clobber T.
#ifdef GOAMD64_v2
#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
#else
#define ROL16(R, T) ROL(16, R, T)
#define ROL8(R, T) ROL(8, R, T)
#endif

// QR is the ChaCha quarter-round on A, B, C, and D. T is an available temporary.
// Per lane it computes:
//	A += B; D ^= A; D <<<= 16
//	C += D; B ^= C; B <<<= 12
//	A += B; D ^= A; D <<<= 8
//	C += D; B ^= C; B <<<= 7
// The 12- and 7-bit rotates have no byte-shuffle shortcut, so they always
// go through ROL and therefore always clobber T.
#define QR(A, B, C, D, T) \
	PADDD B, A; PXOR A, D; ROL16(D, T); \
	PADDD D, C; PXOR C, B; ROL(12, B, T); \
	PADDD B, A; PXOR A, D; ROL8(D, T); \
	PADDD D, C; PXOR C, B; ROL(7, B, T)

// REPLREG replicates the register R into 4 uint32s in XR.
// MOVQ places R in the low bits of XR; PSHUFD $0 then copies lane 0
// into all four lanes.
#define REPLREG(R, XR) \
	MOVQ R, XR; \
	PSHUFD $0, XR, XR

// REPL replicates the uint32 constant val into 4 uint32s in XR. It smashes DX.
#define REPL(val, XR) \
	MOVL $(val), DX; \
	REPLREG(DX, XR)

// SEED copies the off'th uint32 of the seed into the register XR,
// replicating it into all four stripes of the register.
// The seed pointer is read from AX. The scalar word is also left in reg
// so the caller can re-broadcast it later without touching memory again.
#define SEED(off, reg, XR) \
	MOVL (4*(off))(AX), reg; \
	REPLREG(reg, XR)

// block runs 4 ChaCha8 block transformations in the four stripes of the X registers.
// func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
//
// block computes four ChaCha8 blocks in parallel, one per uint32 lane
// ("stripe") of the XMM registers, and writes them interlaced to blocks:
// blocks[i] holds state word i of all four blocks.
//
// Register plan:
//	X0-X3    constant row
//	X4-X11   the eight seed words, each broadcast to all lanes
//	X12      per-lane counters counter+0 .. counter+3
//	X13-X15  zero
//	DI, SI, R8-R13  scalar copies of the seed words, kept for the final add
//	DX       scratch for REPL, then the loop counter
//
// All sixteen XMM registers are needed for state, so there is no spare
// register for QR's temporary. The loop frees one by parking either X15
// or X4 in its own slot of the output buffer, (15*16)(BX) or (4*16)(BX).
//
// Only the seed rows (4-11) have their initial value added back before
// being stored; rows 0-3 and 12-15 are stored as they leave the rounds.
//
// The 16-byte frame is scratch space used to assemble the counter vector.
TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
	// seed in AX
	// blocks in BX
	// counter in CX

	// Load initial constants into top row.
	// Read as little-endian ASCII the four words spell "expand 32-byte k".
	REPL(0x61707865, X0)
	REPL(0x3320646e, X1)
	REPL(0x79622d32, X2)
	REPL(0x6b206574, X3)

	// Load counter into bottom left cell.
	// Each stripe gets a different counter:
	// counter+0, counter+1, counter+2, counter+3.
	// (PINSRD is not available in GOAMD64_v1,
	// so just do it in memory on all systems.
	// This is not on the critical path.)
	MOVL CX, 0(SP)
	INCL CX
	MOVL CX, 4(SP)
	INCL CX
	MOVL CX, 8(SP)
	INCL CX
	MOVL CX, 12(SP)
	MOVOU 0(SP), X12

	// Load seed words into next two rows and into DI, SI, R8..R13
	SEED(0, DI, X4)
	SEED(1, SI, X5)
	SEED(2, R8, X6)
	SEED(3, R9, X7)
	SEED(4, R10, X8)
	SEED(5, R11, X9)
	SEED(6, R12, X10)
	SEED(7, R13, X11)

	// Zeros for the remaining matrix entries (rows 13, 14 and 15).
	// We have just enough XMM registers to hold the state,
	// without one for the temporary, so we flush and restore
	// some values to and from memory to provide a temporary.
	// The initial temporary is X15, so zero its memory instead
	// of X15 itself. X15's register contents do not matter here:
	// QR always writes its temporary before reading it.
	MOVL $0, DX
	MOVQ DX, X13
	MOVQ DX, X14
	MOVOU X14, (15*16)(BX)

	// 4 iterations. Each iteration is 8 quarter-rounds:
	// four on the columns of the 4x4 state, then four on its diagonals.
	// Loop invariant at the top: row 15 lives in memory and X15 is the
	// temporary; every other row lives in its register.
	MOVL $4, DX
loop:
	// Column quarter-rounds.
	QR(X0, X4, X8, X12, X15)
	MOVOU X4, (4*16)(BX) // save X4
	QR(X1, X5, X9, X13, X15)
	MOVOU (15*16)(BX), X15 // reload X15; temp now X4
	QR(X2, X6, X10, X14, X4)
	QR(X3, X7, X11, X15, X4)

	// Diagonal quarter-rounds.
	QR(X0, X5, X10, X15, X4)
	MOVOU X15, (15*16)(BX) // save X15
	QR(X1, X6, X11, X12, X4)
	MOVOU (4*16)(BX), X4 // reload X4; temp now X15
	QR(X2, X7, X8, X13, X15)
	QR(X3, X4, X9, X14, X15)

	DECL DX
	JNZ loop

	// Store interlaced blocks back to output buffer,
	// adding original seed along the way.

	// First the top and bottom rows.
	MOVOU X0, (0*16)(BX)
	MOVOU X1, (1*16)(BX)
	MOVOU X2, (2*16)(BX)
	MOVOU X3, (3*16)(BX)
	MOVOU X12, (12*16)(BX)
	MOVOU X13, (13*16)(BX)
	MOVOU X14, (14*16)(BX)
	// X15 has already been stored.
	// (The "save X15" in the last loop iteration wrote row 15 to its slot.)

	// Now we have X0-X3, X12-X15 available for temporaries.
	// Add seed rows back to output. We left seed in DI, SI, R8..R13 above.
	REPLREG(DI, X0)
	REPLREG(SI, X1)
	REPLREG(R8, X2)
	REPLREG(R9, X3)
	REPLREG(R10, X12)
	REPLREG(R11, X13)
	REPLREG(R12, X14)
	REPLREG(R13, X15)
	PADDD X0, X4
	PADDD X1, X5
	PADDD X2, X6
	PADDD X3, X7
	PADDD X12, X8
	PADDD X13, X9
	PADDD X14, X10
	PADDD X15, X11
	MOVOU X4, (4*16)(BX)
	MOVOU X5, (5*16)(BX)
	MOVOU X6, (6*16)(BX)
	MOVOU X7, (7*16)(BX)
	MOVOU X8, (8*16)(BX)
	MOVOU X9, (9*16)(BX)
	MOVOU X10, (10*16)(BX)
	MOVOU X11, (11*16)(BX)

	// X15 was used above as a temporary and as a seed broadcast, so clear it.
	// NOTE(review): presumably required because ABIInternal treats X15 as a
	// fixed zero register; confirm against the Go internal ABI document.
	MOVL $0, AX
	MOVQ AX, X15 // must be 0 on return

	RET

// rotate left 16 indexes for PSHUFB
// Result byte i takes source byte table[i]. Within each 4-byte lane the
// indexes are 2,3,0,1, which rotates a little-endian uint32 left by 16 bits.
GLOBL ·rol16<>(SB), NOPTR|RODATA, $16
DATA ·rol16<>+0(SB)/8, $0x0504070601000302
DATA ·rol16<>+8(SB)/8, $0x0D0C0F0E09080B0A

// rotate left 8 indexes for PSHUFB
// Within each 4-byte lane the indexes are 3,0,1,2, which rotates a
// little-endian uint32 left by 8 bits.
GLOBL ·rol8<>(SB), NOPTR|RODATA, $16
DATA ·rol8<>+0(SB)/8, $0x0605040702010003
DATA ·rol8<>+8(SB)/8, $0x0E0D0C0F0A09080B