github.com/go-asm/go@v1.21.1-0.20240213172139-40c5ead50c48/chacha8rand/chacha8_arm64.s (about)

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // QR is the ChaCha quarter-round on vector registers A, B, C, and D, applied to every 32-bit lane (.S4).
     8  // Rotate-lefts: 16 via VREV32 on .H8, 12 and 7 via VSHL+VSRI pairs, 8 via a VTBL byte shuffle.
     9  // V30 is used as a temporary (clobbered), and V31 is assumed to hold the index table for rotate left 8.
    10  #define QR(A, B, C, D) \
    11  	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16;   VREV32 D.H8, D.H8; \
    12  	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $12, V30.S4, B.S4; VSRI $20, V30.S4, B.S4 \
    13  	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16;   VTBL V31.B16, [D.B16], D.B16; \
    14  	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL  $7, V30.S4, B.S4; VSRI $25, V30.S4, B.S4
    15  
    16  // block runs 4 ChaCha8 block transformations in the four stripes (32-bit lanes) of the V registers.
    17  // Stripe i uses counter+i. V0-V15 are written to blocks in register order, 16 bytes per register.
    18  // func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
    19  TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
    20  	// seed in R0
    21  	// blocks in R1
    22  	// counter in R2
    23  
    24  	// Load initial constants into top row; VLD4R replicates each 32-bit word across one register.
    25  	MOVD $·chachaConst(SB), R10
    26  	VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
    27  
    28  	// Load increment and rotate 8 constants into V30, V31.
    29  	MOVD $·chachaIncRot(SB), R11
    30  	VLD1 (R11), [V30.S4, V31.S4]
    31  
    32  	VLD4R.P 16(R0), [V4.S4, V5.S4, V6.S4, V7.S4] // seed[0:4], each word replicated; R0 advances by 16
    33  	VLD4R.P 16(R0), [V8.S4, V9.S4, V10.S4, V11.S4] // seed[4:8], each word replicated
    34  
    35  	// Store counter to the stack frame so VLD1R can replicate the uint32 into all four lanes of V12.
    36  	MOVW R2, 0(RSP)
    37  	VLD1R 0(RSP), [V12.S4]
    38  
    39  	// Add 0, 1, 2, 3 to counter stripes. This must precede the rounds, because QR overwrites V30.
    40  	VADD V30.S4, V12.S4, V12.S4
    41  
    42  	// Zeros for remaining three matrix entries.
    43  	VEOR V13.B16, V13.B16, V13.B16
    44  	VEOR V14.B16, V14.B16, V14.B16
    45  	VEOR V15.B16, V15.B16, V15.B16
    46  
    47  	// Save seed state (V4-V11) in V20-V27 for adding back later.
    48  	VMOV V4.B16, V20.B16
    49  	VMOV V5.B16, V21.B16
    50  	VMOV V6.B16, V22.B16
    51  	VMOV V7.B16, V23.B16
    52  	VMOV V8.B16, V24.B16
    53  	VMOV V9.B16, V25.B16
    54  	VMOV V10.B16, V26.B16
    55  	VMOV V11.B16, V27.B16
    56  
    57  	// 4 iterations. Each iteration is 8 quarter-rounds: 4 on columns, then 4 on diagonals.
    58  	MOVD $4, R0 // R0 (seed pointer) is no longer needed; reuse it as the iteration counter
    59  loop:
    60  	QR(V0, V4, V8, V12)
    61  	QR(V1, V5, V9, V13)
    62  	QR(V2, V6, V10, V14)
    63  	QR(V3, V7, V11, V15)
    64  
    65  	QR(V0, V5, V10, V15)
    66  	QR(V1, V6, V11, V12)
    67  	QR(V2, V7, V8, V13)
    68  	QR(V3, V4, V9, V14)
    69  
    70  	SUB $1, R0
    71  	CBNZ R0, loop
    72  
    73  	// Add seed back. Only V4-V11 are adjusted; V0-V3 and V12-V15 are stored as the rounds left them.
    74  	VADD V4.S4, V20.S4, V4.S4
    75  	VADD V5.S4, V21.S4, V5.S4
    76  	VADD V6.S4, V22.S4, V6.S4
    77  	VADD V7.S4, V23.S4, V7.S4
    78  	VADD V8.S4, V24.S4, V8.S4
    79  	VADD V9.S4, V25.S4, V9.S4
    80  	VADD V10.S4, V26.S4, V10.S4
    81  	VADD V11.S4, V27.S4, V11.S4
    82  
    83  	// Store interlaced blocks back to output buffer; each VST1.P writes 64 bytes and advances R1.
    84  	VST1.P [ V0.B16,  V1.B16,  V2.B16,  V3.B16], 64(R1)
    85  	VST1.P [ V4.B16,  V5.B16,  V6.B16,  V7.B16], 64(R1)
    86  	VST1.P [ V8.B16,  V9.B16, V10.B16, V11.B16], 64(R1)
    87  	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R1)
    88  	RET
    89  
    90  GLOBL	·chachaConst(SB), NOPTR|RODATA, $16 // "expand 32-byte k" as four little-endian uint32 words (16 bytes total)
    91  DATA	·chachaConst+0x00(SB)/4, $0x61707865 // "expa"
    92  DATA	·chachaConst+0x04(SB)/4, $0x3320646e // "nd 3"
    93  DATA	·chachaConst+0x08(SB)/4, $0x79622d32 // "2-by"
    94  DATA	·chachaConst+0x0c(SB)/4, $0x6b206574 // "te k"
    95  
    96  GLOBL	·chachaIncRot(SB), NOPTR|RODATA, $32 // 16 bytes of lane increments, then 16 bytes of VTBL byte indices
    97  DATA	·chachaIncRot+0(SB)/4, $0 // per-lane counter increments 0, 1, 2, 3
    98  DATA	·chachaIncRot+4(SB)/4, $1
    99  DATA	·chachaIncRot+8(SB)/4, $2
   100  DATA	·chachaIncRot+12(SB)/4, $3
   101  DATA	·chachaIncRot+16(SB)/4, $0x02010003 // byte indices that rotate each 32-bit word left by 8
   102  DATA	·chachaIncRot+20(SB)/4, $0x06050407
   103  DATA	·chachaIncRot+24(SB)/4, $0x0a09080b
   104  DATA	·chachaIncRot+28(SB)/4, $0x0e0d0c0f