github.com/go-asm/go@v1.21.1-0.20240213172139-40c5ead50c48/chacha8rand/chacha8_amd64.s (about)

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // ChaCha8 is ChaCha with 8 rounds.
     8  // See https://cr.yp.to/chacha/chacha-20080128.pdf.
     9  // See chacha8_generic.go for additional details.
    10  
    11  // ROL rotates every uint32 lane of R left by N bits. T is overwritten:
        // it receives a copy of R shifted up by N bits while R itself is shifted
        // down by 32-N bits, and the two halves are then merged back into R.
    12  #define ROL(N, R, T) \
    13  	MOVO R, T; \
        	PSRLL $(32-(N)), R; \
        	PSLLL $(N), T; \
        	PXOR T, R
    14  
    15  // ROL16 rotates every uint32 lane of R left by 16 bits.
        // Without GOAMD64_v2 it falls back to the shift-based ROL and clobbers T;
        // with GOAMD64_v2 it is a single PSHUFB through the rol16 byte-index table
        // and T is left untouched.
    16  #ifndef GOAMD64_v2
    17  #define ROL16(R, T) ROL(16, R, T)
    18  #else
    19  #define ROL16(R, T) PSHUFB ·rol16<>(SB), R
    20  #endif
    21  
    22  // ROL8 rotates every uint32 lane of R left by 8 bits.
        // Without GOAMD64_v2 it falls back to the shift-based ROL and clobbers T;
        // with GOAMD64_v2 it is a single PSHUFB through the rol8 byte-index table
        // and T is left untouched.
    23  #ifndef GOAMD64_v2
    24  #define ROL8(R, T) ROL(8, R, T)
    25  #else
    26  #define ROL8(R, T) PSHUFB ·rol8<>(SB), R
    27  #endif
    28  
    29  // QR performs one ChaCha quarter-round on the state words A, B, C and D,
        // using T as scratch (T's previous contents are lost). Each of the four
        // lines is one add / xor / rotate step; the rotation amounts are
        // 16, 12, 8 and 7 bits in that order.
    30  #define QR(A, B, C, D, T) \
    31  	PADDD B, A; PXOR A, D; ROL16(D, T); \
    32  	PADDD D, C; PXOR C, B; ROL(12, B, T); \
    33  	PADDD B, A; PXOR A, D; ROL8(D, T); \
    34  	PADDD D, C; PXOR C, B; ROL(7, B, T)
    35  
    36  // REPLREG broadcasts the low 32 bits of general-purpose register R
        // to all four uint32 lanes of XR. R itself is not modified.
    37  #define REPLREG(R, XR) MOVQ R, XR; PSHUFD $0, XR, XR
    40  
    41  // REPL fills all four uint32 lanes of XR with the constant val.
        // The constant is staged through DX, so DX is overwritten.
    42  #define REPL(val, XR) \
    43  	MOVL $val, DX; \
    44  	MOVQ DX, XR; \
        	PSHUFD $0, XR, XR
    45  
    46  // SEED copies the off'th uint32 of the seed into the register XR,
    47  // replicating it into all four stripes of the register.
        // The seed is addressed through AX. The scalar value is also left in the
        // general-purpose register reg so it can be reused later without
        // touching memory again.
        // (The last body line deliberately has no trailing backslash: a dangling
        // continuation would pull whatever follows the macro into its body.)
    48  #define SEED(off, reg, XR) \
    49  	MOVL (4*off)(AX), reg; \
    50  	REPLREG(reg, XR)
    51  
    52  // block runs 4 ChaCha8 block transformations in the four stripes of the X registers.
        //
        // X0-X15 each hold one of the 16 state words; stripe i of every register
        // belongs to the block whose counter is counter+i. The result is written to
        // blocks one 16-byte row per state word, so the four blocks come out interlaced.
        //
        // The symbol has no ABI selector, so it is assembled as ABI0 and its
        // arguments are passed on the stack. They are loaded into AX, BX and CX
        // explicitly instead of being assumed to already be in those registers.
        //
        // Stack: 16 bytes of locals, used to assemble the per-stripe counter vector.
        // Clobbers: AX, CX, DX, SI, DI, R8-R13, X0-X15 and the flags. BX is only read.
    53  
    54  // func block(seed *[8]uint32, blocks *[16][4]uint32, counter uint32)
    55  TEXT ·block(SB), NOSPLIT, $16-20
    56  	MOVQ seed+0(FP), AX     // seed in AX
    57  	MOVQ blocks+8(FP), BX   // blocks in BX
    58  	MOVL counter+16(FP), CX // counter in CX
    59  
    60  	// Load initial constants into top row.
    61  	REPL(0x61707865, X0)
    62  	REPL(0x3320646e, X1)
    63  	REPL(0x79622d32, X2)
    64  	REPL(0x6b206574, X3)
    65  
    66  	// Load counter into bottom left cell.
    67  	// Each stripe gets a different counter: counter+0, +1, +2, +3.
    68  	// (PINSRD is not available in GOAMD64_v1,
    69  	// so just do it in memory on all systems.
    70  	// This is not on the critical path.)
    71  	MOVL CX, 0(SP)
    72  	INCL CX
    73  	MOVL CX, 4(SP)
    74  	INCL CX
    75  	MOVL CX, 8(SP)
    76  	INCL CX
    77  	MOVL CX, 12(SP)
    78  	MOVOU 0(SP), X12
    79  
    80  	// Load seed words into next two rows and into DI, SI, R8..R13
    81  	SEED(0, DI, X4)
    82  	SEED(1, SI, X5)
    83  	SEED(2, R8, X6)
    84  	SEED(3, R9, X7)
    85  	SEED(4, R10, X8)
    86  	SEED(5, R11, X9)
    87  	SEED(6, R12, X10)
    88  	SEED(7, R13, X11)
    89  
    90  	// Zeros for remaining two matrix entries.
    91  	// We have just enough XMM registers to hold the state,
    92  	// without one for the temporary, so we flush and restore
    93  	// some values to and from memory to provide a temporary.
    94  	// The initial temporary is X15, so zero its memory instead
    95  	// of X15 itself.
    96  	MOVL $0, DX
    97  	MOVQ DX, X13
    98  	MOVQ DX, X14
    99  	MOVOU X14, (15*16)(BX)
   100  
   101  	// 4 iterations. Each iteration is 8 quarter-rounds:
        	// four on the columns, then four on the diagonals.
        	// The output buffer doubles as spill space: row 4 parks X4
        	// and row 15 parks X15 while the other one serves as temporary.
   102  	MOVL $4, DX
   103  loop:
   104  	QR(X0, X4, X8, X12, X15)
   105  	MOVOU X4, (4*16)(BX) // save X4
   106  	QR(X1, X5, X9, X13, X15)
   107  	MOVOU (15*16)(BX), X15 // reload X15; temp now X4
   108  	QR(X2, X6, X10, X14, X4)
   109  	QR(X3, X7, X11, X15, X4)
   110  
   111  	QR(X0, X5, X10, X15, X4)
   112  	MOVOU X15, (15*16)(BX) // save X15
   113  	QR(X1, X6, X11, X12, X4)
   114  	MOVOU (4*16)(BX), X4  // reload X4; temp now X15
   115  	QR(X2, X7, X8, X13, X15)
   116  	QR(X3, X4, X9, X14, X15)
   117  
   118  	DECL DX
   119  	JNZ loop
   120  
   121  	// Store interlaced blocks back to output buffer,
   122  	// adding original seed along the way.
   123  
   124  	// First the top and bottom rows.
        	// These are stored as computed; nothing is added back to them.
   125  	MOVOU X0, (0*16)(BX)
   126  	MOVOU X1, (1*16)(BX)
   127  	MOVOU X2, (2*16)(BX)
   128  	MOVOU X3, (3*16)(BX)
   129  	MOVOU X12, (12*16)(BX)
   130  	MOVOU X13, (13*16)(BX)
   131  	MOVOU X14, (14*16)(BX)
   132  	// X15 has already been stored (by the last loop iteration).
   133  
   134  	// Now we have X0-X3, X12-X15 available for temporaries.
   135  	// Add seed rows back to output. We left seed in DI, SI, R8..R13 above.
   136  	REPLREG(DI, X0)
   137  	REPLREG(SI, X1)
   138  	REPLREG(R8, X2)
   139  	REPLREG(R9, X3)
   140  	REPLREG(R10, X12)
   141  	REPLREG(R11, X13)
   142  	REPLREG(R12, X14)
   143  	REPLREG(R13, X15)
   144  	PADDD X0, X4
   145  	PADDD X1, X5
   146  	PADDD X2, X6
   147  	PADDD X3, X7
   148  	PADDD X12, X8
   149  	PADDD X13, X9
   150  	PADDD X14, X10
   151  	PADDD X15, X11
   152  	MOVOU X4, (4*16)(BX)
   153  	MOVOU X5, (5*16)(BX)
   154  	MOVOU X6, (6*16)(BX)
   155  	MOVOU X7, (7*16)(BX)
   156  	MOVOU X8, (8*16)(BX)
   157  	MOVOU X9, (9*16)(BX)
   158  	MOVOU X10, (10*16)(BX)
   159  	MOVOU X11, (11*16)(BX)
   160  
   161  	MOVL $0, AX
   162  	MOVQ AX, X15 // must be 0 on return
   163  
   164  	RET
   165  
   166  // PSHUFB byte indexes that rotate each uint32 lane left by 16 bits.
        // One 32-bit word per lane: destination byte i of a lane takes the
        // source byte named by byte i of the word (lowest byte first).
   167  GLOBL ·rol16<>(SB), NOPTR|RODATA, $16
   168  DATA ·rol16<>+0(SB)/4, $0x01000302
        DATA ·rol16<>+4(SB)/4, $0x05040706
   169  DATA ·rol16<>+8(SB)/4, $0x09080B0A
        DATA ·rol16<>+12(SB)/4, $0x0D0C0F0E
   170  
   171  // PSHUFB byte indexes that rotate each uint32 lane left by 8 bits.
        // One 32-bit word per lane: destination byte i of a lane takes the
        // source byte named by byte i of the word (lowest byte first).
   172  GLOBL ·rol8<>(SB), NOPTR|RODATA, $16
   173  DATA ·rol8<>+0(SB)/4, $0x02010003
        DATA ·rol8<>+4(SB)/4, $0x06050407
   174  DATA ·rol8<>+8(SB)/4, $0x0A09080B
        DATA ·rol8<>+12(SB)/4, $0x0E0D0C0F