git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/chacha/macro.s

// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl

// ROTL_SSE rotates all 4 32 bit values of the XMM register v
// left by n bits using SSE2 instructions (0 <= n <= 32).
// The XMM register t is used as a temp. register.
#define ROTL_SSE(n, t, v) \
	MOVO v, t; \
	PSLLL $n, t; \
	PSRLL $(32-n), v; \
	PXOR t, v

// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
// The AVX/AVX2 register t is used as a temp. register.
#define ROTL_AVX(n, t, v) \
	VPSLLD $n, v, t; \
	VPSRLD $(32-n), v, v; \
	VPXOR v, t, v

// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE for
// rotations. The XMM register t is used as a temp. register.
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE(16, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(12, t, v1); \
	PADDL v1, v0; \
	PXOR v0, v3; \
	ROTL_SSE(8, t, v3); \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(7, t, v1)

// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
// rotations. The XMM register t is used as a temp. register.
//
// r16 holds the PSHUFB constant for a 16 bit left rotate.
// r8 holds the PSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
	PADDL v1, v0; \
	PXOR v0, v3; \
	PSHUFB r16, v3; \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(12, t, v1); \
	PADDL v1, v0; \
	PXOR v0, v3; \
	PSHUFB r8, v3; \
	PADDL v3, v2; \
	PXOR v2, v1; \
	ROTL_SSE(7, t, v1)

// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
// rotations. The AVX/AVX2 register t is used as a temp. register.
//
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
// r8 holds the VPSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
	VPADDD v0, v1, v0; \
	VPXOR v3, v0, v3; \
	VPSHUFB r16, v3, v3; \
	VPADDD v2, v3, v2; \
	VPXOR v1, v2, v1; \
	ROTL_AVX(12, t, v1); \
	VPADDD v0, v1, v0; \
	VPXOR v3, v0, v3; \
	VPSHUFB r8, v3, v3; \
	VPADDD v2, v3, v2; \
	VPXOR v1, v2, v1; \
	ROTL_AVX(7, t, v1)

// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3

// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
	VPSHUFD $0x39, v1, v1; \
	VPSHUFD $0x4E, v2, v2; \
	VPSHUFD $0x93, v3, v3

// XOR_SSE extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding XMM
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
	MOVOU 0+off(src), t; \
	PXOR v0, t; \
	MOVOU t, 0+off(dst); \
	MOVOU 16+off(src), t; \
	PXOR v1, t; \
	MOVOU t, 16+off(dst); \
	MOVOU 32+off(src), t; \
	PXOR v2, t; \
	MOVOU t, 32+off(dst); \
	MOVOU 48+off(src), t; \
	PXOR v3, t; \
	MOVOU t, 48+off(dst)

// XOR_AVX extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding AVX
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
	VPXOR 0+off(src), v0, t; \
	VMOVDQU t, 0+off(dst); \
	VPXOR 16+off(src), v1, t; \
	VMOVDQU t, 16+off(dst); \
	VPXOR 32+off(src), v2, t; \
	VMOVDQU t, 32+off(dst); \
	VPXOR 48+off(src), v3, t; \
	VMOVDQU t, 48+off(dst)

// XOR_AVX2 extracts 4x32 byte vectors from src at
// off, xors the first 64 bytes with the low 128 bit
// lanes of the AVX2 registers v0 - v3 and the second
// 64 bytes with their high lanes, and writes the
// result to dst at off.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU (0+off)(src), t0; \
	VPERM2I128 $32, v1, v0, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (0+off)(dst); \
	VMOVDQU (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (32+off)(dst); \
	VMOVDQU (64+off)(src), t0; \
	VPERM2I128 $49, v1, v0, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (64+off)(dst); \
	VMOVDQU (96+off)(src), t0; \
	VPERM2I128 $49, v3, v2, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (96+off)(dst)

// XOR_UPPER_AVX2 is like XOR_AVX2, but only xors the
// first 64 bytes of src at off with the low 128 bit
// lanes of v0 - v3 and writes the result to dst at off.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU (0+off)(src), t0; \
	VPERM2I128 $32, v1, v0, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (0+off)(dst); \
	VMOVDQU (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1; \
	VPXOR t0, t1, t0; \
	VMOVDQU t0, (32+off)(dst)

// EXTRACT_LOWER writes the high 128 bit lanes of
// v0 - v3 to the 64 bytes at dst.
// The AVX2 register t0 is used as a temp. register.
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
	VPERM2I128 $49, v1, v0, t0; \
	VMOVDQU t0, 0(dst); \
	VPERM2I128 $49, v3, v2, t0; \
	VMOVDQU t0, 32(dst)
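
// Usage sketch (illustrative only, not part of the original file): the
// quarter-round and shuffle macros above compose into one ChaCha double
// round. Assuming the 4x4 ChaCha state is held row-wise in X0 - X3 and X4
// is free as a scratch register, a double round on the SSE2 path would
// look like:
//
//	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) // column round
//	CHACHA_SHUFFLE_SSE(X1, X2, X3)         // align diagonals into columns
//	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) // diagonal round
//	CHACHA_SHUFFLE_SSE(X3, X2, X1)         // inverse shuffle, back to rows
//
// ChaCha20 runs 10 such double rounds, adds the initial state back in and
// xors the resulting keystream block into the input (see XOR_SSE above).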