// +build !noasm,go1.10
// hwaccel_amd64.s - AMD64 optimized routines
//
// To the extent possible under law, Yawning Angel has waived all copyright
// and related or neighboring rights to the software, using the Creative
// Commons "CC0" public domain dedication. See LICENSE or
// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.

#include "textflag.h"

// func cpuidAmd64(cpuidParams *uint32)
//
// Executes CPUID.  cpuidParams is treated as four consecutive 32-bit words:
// word 0 is loaded into EAX and word 2 into ECX before the instruction, and
// afterwards EAX, EBX, ECX and EDX are written back to words 0..3.
TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
	MOVQ cpuidParams+0(FP), R8
	MOVL (R8), AX
	MOVL 8(R8), CX
	CPUID
	MOVL AX, (R8)
	MOVL BX, 4(R8)
	MOVL CX, 8(R8)
	MOVL DX, 12(R8)
	RET

// func xgetbv0Amd64(xcrVec *uint32)
//
// Executes XGETBV with ECX = 0 and stores the result as two 32-bit words:
// EAX to xcrVec[0] and EDX to xcrVec[1].
TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
	XORL CX, CX
	MOVQ xcrVec+0(FP), R8
	XGETBV
	MOVL AX, (R8)
	MOVL DX, 4(R8)
	RET

// Macros for initializing, updating and finalizing the cipher state follow,
// together with readable aliases for the registers they operate on.

// YMM Registers: Sx -> State, Mx -> Message, Tx -> Temporary
// GP Registers: AX, CX, SI, DI -> Temporary
#define S0 Y0
#define S1 Y1
#define S2 Y2
#define S3 Y3
#define S4 Y4
#define M0 Y5
#define T0 Y14
#define T1 Y15

// The state update is essentially a naive translation of the intrinsics
// based code.  Neither GCC's nor clang's idea of what it should be appears to
// be any better on Broadwell, and staying close to the upstream
// implementation keeps the two easy to cross reference.
// STATE_UPDATE performs one round of the state update over S0..S4, absorbing
// the message block held in M0.
//
// The round consists of five steps, one per state word.  Each step XORs a
// state word with other state words (and, for S1..S4, with M0; the S0 step
// does not use M0), XORs in the AND of two further state words, rotates every
// 64-bit lane left (by 13, 46, 38, 7 and 4 bits respectively, built from a
// left shift by r and a right shift by 64-r), and finally permutes the 64-bit
// lanes of one other state word with VPERMQ.
//
// Clobbers: T0, T1.  M0 is read but not modified.
// Note: no comments are placed inside the macro bodies in this file; each
// body line ends in a continuation backslash.
#define STATE_UPDATE() \
	VPXOR S0, S3, S0 \
	VPAND S1, S2, T0 \
	VPXOR S0, T0, S0 \
	VPSLLQ $13, S0, T0 \
	VPSRLQ $51, S0, T1 \
	VPOR T0, T1, S0 \
	VPERMQ $-109, S3, S3 \
	\
	VPXOR S1, M0, S1 \
	VPXOR S1, S4, S1 \
	VPAND S2, S3, T0 \
	VPXOR S1, T0, S1 \
	VPSLLQ $46, S1, T0 \
	VPSRLQ $18, S1, T1 \
	VPOR T0, T1, S1 \
	VPERMQ $78, S4, S4 \
	\
	VPXOR S2, M0, S2 \
	VPXOR S2, S0, S2 \
	VPAND S3, S4, T0 \
	VPXOR S2, T0, S2 \
	VPSLLQ $38, S2, T0 \
	VPSRLQ $26, S2, T1 \
	VPOR T0, T1, S2 \
	VPERMQ $57, S0, S0 \
	\
	VPXOR S3, M0, S3 \
	VPXOR S3, S1, S3 \
	VPAND S4, S0, T0 \
	VPXOR S3, T0, S3 \
	VPSLLQ $7, S3, T0 \
	VPSRLQ $57, S3, T1 \
	VPOR T0, T1, S3 \
	VPERMQ $78, S1, S1 \
	\
	VPXOR S4, M0, S4 \
	VPXOR S4, S2, S4 \
	VPAND S0, S1, T0 \
	VPXOR S4, T0, S4 \
	VPSLLQ $4, S4, T0 \
	VPSRLQ $60, S4, T1 \
	VPOR T0, T1, S4 \
	VPERMQ $-109, S2, S2

// COPY copies LEN bytes from the address in SRC to the address in DST using
// REP MOVSB.
//
// Clobbers: SI, DI, CX.  The arguments are moved in the order SRC, DST, LEN,
// so an argument must not live in a register that an earlier move has already
// overwritten.  The direction flag must be clear; the AEAD entry points in
// this file execute CLD before the first use.
#define COPY(DST, SRC, LEN) \
	MOVQ SRC, SI \
	MOVQ DST, DI \
	MOVQ LEN, CX \
	REP \
	MOVSB

// INIT_STATE loads the initial state and runs 16 rounds of STATE_UPDATE with
// an all-zero message block.
//
//   S0 = 16 bytes read from (IV) in the low half, upper half zero
//   S1 = 32 bytes read from (KEY)
//   S2 = all ones
//   S3 = zero
//   S4 = ·initializationConstants
//
// After the 16 rounds the saved copy of the key (Y6) is XORed into S1.
//
// Clobbers: AX, M0 (left as zero), Y6, T0, T1.
// The load from ·initializationConstants(SB) is a global variable access.
// When assembling for dynamic linking the Go toolchain performs such accesses
// via R15, so no live value may be kept in R15 across this macro.
// The macro defines the label initLoop; labels are scoped to the enclosing
// TEXT block, so it is expanded once per function.
#define INIT_STATE(IV, KEY) \
	VPXOR S0, S0, S0 \
	MOVOU (IV), X0 \
	VMOVDQU (KEY), S1 \
	VPCMPEQD S2, S2, S2 \
	VPXOR S3, S3, S3 \
	VMOVDQU ·initializationConstants(SB), S4 \
	VPXOR M0, M0, M0 \
	VMOVDQA S1, Y6 \
	MOVQ $16, AX \
	\
initLoop: \
	STATE_UPDATE() \
	SUBQ $1, AX \
	JNZ initLoop \
	\
	VPXOR Y6, S1, S1

// ABSORB_BLOCKS feeds ALEN bytes starting at the address in A into the state
// without producing any output.
//
// Whole 32-byte blocks are loaded directly from A.  A trailing partial block
// (ALEN mod 32 bytes) is first copied into the 32-byte buffer addressed by
// SCRATCH and then loaded from there; the bytes of SCRATCH that are not
// overwritten by the copy are absorbed as-is, so the caller must have zeroed
// the buffer beforehand.
//
// Clobbers: A (advanced past the full blocks), ALEN (reduced mod 32), AX, CX,
// SI, DI, M0, T0, T1.
// Defines the labels loopAbsorbFull, absorbPartial and absorbDone.
#define ABSORB_BLOCKS(A, ALEN, SCRATCH) \
	MOVQ ALEN, AX \
	SHRQ $5, AX \
	JZ absorbPartial \
loopAbsorbFull: \
	VMOVDQU (A), M0 \
	STATE_UPDATE() \
	ADDQ $32, A \
	SUBQ $1, AX \
	JNZ loopAbsorbFull \
absorbPartial: \
	ANDQ $31, ALEN \
	JZ absorbDone \
	COPY(SCRATCH, A, ALEN) \
	VMOVDQU (SCRATCH), M0 \
	STATE_UPDATE() \
absorbDone:

// FINALIZE computes the 16-byte authentication tag and stores it at the
// address in TAG.
//
// ALEN and MLEN are byte lengths; they are converted to bit lengths in place
// and written to SCRATCH[0:8] and SCRATCH[8:16].  All 32 bytes of SCRATCH are
// then loaded as the message block, so the caller must have zeroed the buffer
// beforehand.  S0 is XORed into S4, 10 rounds of STATE_UPDATE are run, and
// the tag is the low 128 bits of
//
//   S0 ^ VPERMQ($57, S1) ^ (S2 & S3)
//
// Clobbers: ALEN, MLEN, AX, M0, Y6, Y7, T0, T1.
// Defines the label loopFinal.
#define FINALIZE(TAG, ALEN, MLEN, SCRATCH) \
	SHLQ $3, ALEN \
	MOVQ ALEN, (SCRATCH) \
	SHLQ $3, MLEN \
	MOVQ MLEN, 8(SCRATCH) \
	\
	VPXOR S4, S0, S4 \
	VMOVDQU (SCRATCH), M0 \
	\
	MOVQ $10, AX \
loopFinal: \
	STATE_UPDATE() \
	SUBQ $1, AX \
	JNZ loopFinal \
	\
	VPERMQ $57, S1, Y6 \
	VPXOR S0, Y6, Y6 \
	VPAND S2, S3, Y7 \
	VPXOR Y6, Y7, Y7 \
	MOVOU X7, (TAG)

// func aeadEncryptAVX2(c, m, a []byte, nonce, key *byte)
//
// Encrypts len(m) bytes from m into c and writes the 16-byte tag directly
// after the ciphertext, so c must have room for len(m)+16 bytes.  a is
// absorbed as associated data.  nonce is read as 16 bytes and key as 32
// bytes (see INIT_STATE).
//
// Register roles:
//   R11       -> 32-byte scratch buffer in the local frame
//   Y13       -> all-zero vector used to wipe the scratch buffer
//   R8/R9/R10 -> source pointer / remaining length / destination pointer
//
// The scratch pointer is deliberately not kept in R15: INIT_STATE references
// a global symbol, and the toolchain may clobber R15 for such accesses when
// dynamic linking.
TEXT ·aeadEncryptAVX2(SB), NOSPLIT, $32-88
	MOVQ SP, R11
	VPXOR Y13, Y13, Y13
	VMOVDQU Y13, (R11)
	CLD // COPY relies on forward string operations.

	// Initialize the state.
	MOVQ nonce+72(FP), R8
	MOVQ key+80(FP), R9
	INIT_STATE(R8, R9)

	// Absorb the AD.
	MOVQ a_base+48(FP), R8 // &a[0] -> R8
	MOVQ a_len+56(FP), R9  // len(a) -> R9
	ABSORB_BLOCKS(R8, R9, R11)

	// Encrypt the data.
	MOVQ m_base+24(FP), R8 // &m[0] -> R8
	MOVQ m_len+32(FP), R9  // len(m) -> R9
	MOVQ c_base+0(FP), R10 // &c[0] -> R10

	MOVQ R9, AX
	SHRQ $5, AX // Number of full 32-byte blocks.
	JZ encryptPartial

loopEncryptFull:
	// Keystream = S0 ^ VPERMQ($57, S1) ^ (S2 & S3); c = m ^ keystream.
	VMOVDQU (R8), M0
	VPERMQ $57, S1, Y6
	VPXOR S0, Y6, Y6
	VPAND S2, S3, Y7
	VPXOR Y6, Y7, Y6
	VPXOR M0, Y6, Y6
	VMOVDQU Y6, (R10)
	STATE_UPDATE() // The state absorbs the plaintext block in M0.
	ADDQ $32, R8
	ADDQ $32, R10
	SUBQ $1, AX
	JNZ loopEncryptFull

encryptPartial:
	ANDQ $31, R9
	JZ encryptDone

	// Stage the trailing bytes in the zeroed scratch buffer so that the
	// block absorbed into the state is the plaintext padded with zeros.
	VMOVDQU Y13, (R11)
	COPY(R11, R8, R9)
	VMOVDQU (R11), M0
	VPERMQ $57, S1, Y6
	VPXOR S0, Y6, Y6
	VPAND S2, S3, Y7
	VPXOR Y6, Y7, Y6
	VPXOR M0, Y6, Y6
	VMOVDQU Y6, (R11)
	STATE_UPDATE()
	COPY(R10, R11, R9) // Only the first R9 ciphertext bytes are emitted.
	ADDQ R9, R10       // R10 now points just past the ciphertext.

encryptDone:

	// Finalize and write the tag.
	MOVQ a_len+56(FP), R8 // len(a) -> R8
	MOVQ m_len+32(FP), R9 // len(m) -> R9
	VMOVDQU Y13, (R11)    // FINALIZE expects a zeroed scratch buffer.
	FINALIZE(R10, R8, R9, R11)

	VMOVDQU Y13, (R11) // Wipe the scratch buffer before returning.
	VZEROUPPER
	RET

// func aeadDecryptAVX2(m, c, a []byte, nonce, key, tag *byte)
//
// Decrypts len(c) bytes from c into m and writes the computed 16-byte tag to
// tag.  No tag comparison happens here; that is left to the caller.  a is
// absorbed as associated data.  nonce is read as 16 bytes and key as 32 bytes
// (see INIT_STATE).
//
// Register roles:
//   R11       -> 32-byte scratch buffer in the local frame
//   Y13       -> all-zero vector used to wipe the scratch buffer
//   R8/R9/R10 -> source pointer / remaining length / destination pointer
//   R14       -> tag output pointer
//
// The scratch pointer is deliberately not kept in R15: INIT_STATE references
// a global symbol, and the toolchain may clobber R15 for such accesses when
// dynamic linking.
TEXT ·aeadDecryptAVX2(SB), NOSPLIT, $32-96
	MOVQ SP, R11
	VPXOR Y13, Y13, Y13
	VMOVDQU Y13, (R11)
	CLD // COPY and REP STOSB rely on forward string operations.

	// Initialize the state.
	MOVQ nonce+72(FP), R8
	MOVQ key+80(FP), R9
	INIT_STATE(R8, R9)

	// Absorb the AD.
	MOVQ a_base+48(FP), R8 // &a[0] -> R8
	MOVQ a_len+56(FP), R9  // len(a) -> R9
	ABSORB_BLOCKS(R8, R9, R11)

	// Decrypt the data.
	MOVQ c_base+24(FP), R8 // &c[0] -> R8
	MOVQ c_len+32(FP), R9  // len(c) -> R9
	MOVQ m_base+0(FP), R10 // &m[0] -> R10

	MOVQ R9, AX
	SHRQ $5, AX // Number of full 32-byte blocks.
	JZ decryptPartial

loopDecryptFull:
	// Keystream = S0 ^ VPERMQ($57, S1) ^ (S2 & S3); m = c ^ keystream.
	VMOVDQU (R8), M0
	VPERMQ $57, S1, Y6
	VPXOR S0, Y6, Y6
	VPAND S2, S3, Y7
	VPXOR Y6, Y7, Y6
	VPXOR M0, Y6, M0
	VMOVDQU M0, (R10)
	STATE_UPDATE() // The state absorbs the recovered plaintext in M0.
	ADDQ $32, R8
	ADDQ $32, R10
	SUBQ $1, AX
	JNZ loopDecryptFull

decryptPartial:
	ANDQ $31, R9
	JZ decryptDone
	VMOVDQU Y13, (R11)
	COPY(R11, R8, R9)
	VMOVDQU (R11), M0
	VPERMQ $57, S1, Y6
	VPXOR S0, Y6, Y6
	VPAND S2, S3, Y7
	VPXOR Y6, Y7, Y6
	VPXOR M0, Y6, M0
	VMOVDQU M0, (R11)
	COPY(R10, R11, R9)

	// Bytes R9..31 of the scratch buffer now hold raw keystream.  Zero them
	// so that the block absorbed into the state is the plaintext padded
	// with zeros.
	MOVQ $0, AX
	MOVQ R11, DI
	MOVQ $32, CX
	SUBQ R9, CX
	ADDQ R9, DI
	REP
	STOSB
	VMOVDQU (R11), M0
	STATE_UPDATE()

decryptDone:

	// Finalize and write the tag.
	MOVQ a_len+56(FP), R8 // len(a) -> R8
	MOVQ c_len+32(FP), R9 // len(c) -> R9; used as the message length, presumably equal to len(m) - confirm against the caller.
	MOVQ tag+88(FP), R14  // tag -> R14
	VMOVDQU Y13, (R11)    // FINALIZE expects a zeroed scratch buffer.
	FINALIZE(R14, R8, R9, R11)

	VMOVDQU Y13, (R11) // Wipe the scratch buffer before returning.
	VZEROUPPER
	RET