github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_simd_amd64.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), RODATA, $16

// Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions.
// input: from high to low
// r0 = [w3, w2, w1, w0]
// r1 = [w7, w6, w5, w4]
// r2 = [w11, w10, w9, w8]
// r3 = [w15, w14, w13, w12]
// r: 32/64 temp register
// tmp1: 128 bits temp register
// tmp2: 128 bits temp register
//
// output: from high to low
// r0 = [w12, w8, w4, w0]
// r1 = [w13, w9, w5, w1]
// r2 = [w14, w10, w6, w2]
// r3 = [w15, w11, w7, w3]
//
// SSE2/MMX instructions:
//	MOVOU r0, tmp2;
//	PUNPCKHDQ r1, tmp2;
//	PUNPCKLDQ r1, r0;
//	MOVOU r2, tmp1;
//	PUNPCKLDQ r3, tmp1;
//	PUNPCKHDQ r3, r2;
//	MOVOU r0, r1;
//	PUNPCKHQDQ tmp1, r1;
//	PUNPCKLQDQ tmp1, r0;
//	MOVOU tmp2, r3;
//	PUNPCKHQDQ r2, r3;
//	PUNPCKLQDQ r2, tmp2;
//	MOVOU tmp2, r2
#define SSE_TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	MOVOU r0, tmp2; \
	PUNPCKHLQ r1, tmp2; \
	PUNPCKLLQ r1, r0; \
	MOVOU r2, tmp1; \
	PUNPCKLLQ r3, tmp1; \
	PUNPCKHLQ r3, r2; \
	MOVOU r0, r1; \
	PUNPCKHQDQ tmp1, r1; \
	PUNPCKLQDQ tmp1, r0; \
	MOVOU tmp2, r3; \
	PUNPCKHQDQ r2, r3; \
	PUNPCKLQDQ r2, tmp2; \
	MOVOU tmp2, r2

#define a X0
#define b X1
#define c X2
#define d X3
#define e X4
#define f X5
#define g X6
#define h X7

#define tmp1 X8
#define tmp2 X9

#define storeState(R) \
	MOVOU a, (R); \
	MOVOU b, 16(R); \
	MOVOU c, 32(R); \
	MOVOU d, 48(R); \
	MOVOU e, 64(R); \
	MOVOU f, 80(R); \
	MOVOU g, 96(R); \
	MOVOU h, 112(R)

#define storeWord(W, j) MOVOU W, (128+(j)*16)(BX)
#define loadWord(W, i) MOVOU (128+(i)*16)(BX), W

#define SSE_REV32(a, b, c, d) \
	PSHUFB flip_mask<>(SB), a; \
	PSHUFB flip_mask<>(SB), b; \
	PSHUFB flip_mask<>(SB), c; \
	PSHUFB flip_mask<>(SB), d

#define prepare4Words(i) \
	MOVOU (i*16)(R8), X10; \
	MOVOU (i*16)(R9), X11; \
	MOVOU (i*16)(R10), X12; \
	MOVOU (i*16)(R11), X13; \
	; \
	SSE_TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
	SSE_REV32(X10, X11, X12, X13); \
	; \
	storeWord(X10, 4*i+0); \
	storeWord(X11, 4*i+1); \
	storeWord(X12, 4*i+2); \
	storeWord(X13, 4*i+3)

#define LOAD_T(index, T) \
	MOVL (index*4)(AX), T; \
	PSHUFD $0, T, T

// r <<< n, SSE version
#define PROLD(r, n) \
	MOVOU r, tmp1; \
	PSLLL $n, r; \
	PSRLL $(32-n), tmp1; \
	POR tmp1, r

#define SSE_SS1SS2(index, a, e, TMP, SS1, SS2) \
	MOVOU a, SS1; \
	PROLD(SS1, 12); \
	MOVOU SS1, SS2; \ // a <<< 12
	LOAD_T(index, TMP); \
	PADDL TMP, SS1; \
	PADDL e, SS1; \
	PROLD(SS1, 7); \ // SS1
	PXOR SS1, SS2 // SS2

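// For reference, the SM3 boolean functions implemented by the FF/GG macros
// below are (with j the round index):
//   FFj(x, y, z) = x XOR y XOR z                         for 0 <= j <= 15
//   FFj(x, y, z) = (x AND y) OR (x AND z) OR (y AND z)   for 16 <= j <= 63
//   GGj(x, y, z) = x XOR y XOR z                         for 0 <= j <= 15
//   GGj(x, y, z) = (x AND y) OR (NOT x AND z)            for 16 <= j <= 63
// The GG1 form used here, ((y XOR z) AND x) XOR z, is an equivalent rewrite
// of the last line that needs one temporary register less.
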
// DST = X XOR Y XOR Z
#define SSE_FF0(X, Y, Z, DST) \
	MOVOU X, DST; \
	PXOR Y, DST; \
	PXOR Z, DST

// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define SSE_FF1(X, Y, Z, TMP, DST) \
	MOVOU X, DST; \
	POR Y, DST; \
	MOVOU X, TMP; \
	PAND Y, TMP; \
	PAND Z, DST; \
	POR TMP, DST // (a AND b) OR (a AND c) OR (b AND c)

// DST = X XOR Y XOR Z
#define SSE_GG0(X, Y, Z, DST) \
	SSE_FF0(X, Y, Z, DST)

// DST = ((Y XOR Z) AND X) XOR Z
#define SSE_GG1(X, Y, Z, DST) \
	MOVOU Y, DST; \
	PXOR Z, DST; \
	PAND X, DST; \
	PXOR Z, DST

#define SSE_COPY_RESULT(b, d, f, h, TT1, TT2) \
	PROLD(b, 9); \
	MOVOU TT1, h; \
	PROLD(f, 19); \
	MOVOU TT2, TT1; \
	PROLD(TT1, 9); \
	PXOR TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
	PSHUFB r08_mask<>(SB), TT1; \ // ROTL(17, tt2)
	PXOR TT2, TT1; \ // tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
	MOVOU TT1, d

#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
	SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
	SSE_FF0(a, b, c, X14); \
	PADDL d, X14; \ // (a XOR b XOR c) + d
	loadWord(X10, index); \
	loadWord(X11, index+4); \
	PXOR X10, X11; \ // Wt XOR Wt+4
	PADDL X11, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
	PADDL X14, X13; \ // TT1
	PADDL h, X10; \ // Wt + h
	PADDL X12, X10; \ // Wt + h + SS1
	SSE_GG0(e, f, g, X11); \
	PADDL X11, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	; \ // copy result
	SSE_COPY_RESULT(b, d, f, h, X13, X10)

#define MESSAGE_SCHEDULE(index) \
	loadWord(X10, index+1); \ // Wj-3
	PROLD(X10, 15); \
	loadWord(X11, index-12); \ // Wj-16
	PXOR X11, X10; \
	loadWord(X11, index-5); \ // Wj-9
	PXOR X11, X10; \
	MOVOU X10, X11; \
	PROLD(X11, 15); \
	PXOR X11, X10; \
	PSHUFB r08_mask<>(SB), X11; \
	PXOR X11, X10; \ // P1
	loadWord(X11, index-9); \ // Wj-13
	PROLD(X11, 7); \
	PXOR X11, X10; \
	loadWord(X11, index-2); \ // Wj-6
	PXOR X10, X11; \
	storeWord(X11, index+4)

#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \
	ROUND_00_11(index, a, b, c, d, e, f, g, h)

#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, please do not use it
	SSE_SS1SS2(index, a, e, tmp2, X12, X13); \
	; \
	SSE_FF1(a, b, c, X10, X14); \
	PADDL d, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWord(X10, index); \
	PXOR X10, X11; \ // Wt XOR Wt+4
	PADDL X11, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
	PADDL X14, X13; \ // TT1
	; \
	PADDL h, X10; \ // Wt + h
	PADDL X12, X10; \ // Wt + h + SS1
	SSE_GG1(e, f, g, X11); \
	PADDL X11, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
	; \ // copy result
	SSE_COPY_RESULT(b, d, f, h, X13, X10)

// transpose matrix function, AVX version
// parameters:
// - r0: 128 bits register as input/output data
// - r1: 128 bits register as input/output data
// - r2: 128 bits register as input/output data
// - r3: 128 bits register as input/output data
// - tmp1: 128 bits temp register
// - tmp2: 128 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, tmp1, tmp2) \
	VPUNPCKHDQ r1, r0, tmp2; \ // tmp2 = [w07, w03, w06, w02]
	VPUNPCKLDQ r1, r0, r0; \ // r0 = [w05, w01, w04, w00]
	VPUNPCKLDQ r3, r2, tmp1; \ // tmp1 = [w13, w09, w12, w08]
	VPUNPCKHDQ r3, r2, r2; \ // r2 = [w15, w11, w14, w10]
	VPUNPCKHQDQ tmp1, r0, r1; \ // r1 = [w13, w09, w05, w01]
	VPUNPCKLQDQ tmp1, r0, r0; \ // r0 = [w12, w08, w04, w00]
	VPUNPCKHQDQ r2, tmp2, r3; \ // r3 = [w15, w11, w07, w03]
	VPUNPCKLQDQ r2, tmp2, r2 // r2 = [w14, w10, w06, w02]

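// Layout of the temporary buffer at BX, shared by the SSE and AVX paths and
// derived from the offsets used below: the first 8*16 bytes hold the saved
// (transposed) state vectors a..h, and expanded message word Wj — one 32-bit
// word per lane — lives at offset 128 + j*16, which is exactly what
// storeWord/loadWord and their avx* variants address.
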
#define avxStoreWord(W, j) VMOVDQU W, (128+(j)*16)(BX)
#define avxLoadWord(W, i) VMOVDQU (128+(i)*16)(BX), W

#define avxStoreState(R) \
	VMOVDQU a, (0*16)(R); \
	VMOVDQU b, (1*16)(R); \
	VMOVDQU c, (2*16)(R); \
	VMOVDQU d, (3*16)(R); \
	VMOVDQU e, (4*16)(R); \
	VMOVDQU f, (5*16)(R); \
	VMOVDQU g, (6*16)(R); \
	VMOVDQU h, (7*16)(R)

#define AVX_REV32(a, b, c, d) \
	VPSHUFB flip_mask<>(SB), a, a; \
	VPSHUFB flip_mask<>(SB), b, b; \
	VPSHUFB flip_mask<>(SB), c, c; \
	VPSHUFB flip_mask<>(SB), d, d

#define avxPrepare4Words(i) \
	VMOVDQU (i*16)(R8), X10; \
	VMOVDQU (i*16)(R9), X11; \
	VMOVDQU (i*16)(R10), X12; \
	VMOVDQU (i*16)(R11), X13; \
	; \
	TRANSPOSE_MATRIX(X10, X11, X12, X13, tmp1, tmp2); \
	AVX_REV32(X10, X11, X12, X13); \
	; \
	avxStoreWord(X10, 4*i+0); \
	avxStoreWord(X11, 4*i+1); \
	avxStoreWord(X12, 4*i+2); \
	avxStoreWord(X13, 4*i+3)

#define AVX_LOAD_T(index, T) \
	MOVL (index*4)(AX), T; \
	VPSHUFD $0, T, T

// r <<< n
#define VPROLD(r, n) \
	VPSLLD $(n), r, tmp1; \
	VPSRLD $(32-n), r, r; \
	VPOR tmp1, r, r

// d = r <<< n
#define VPROLD2(r, d, n) \
	VPSLLD $(n), r, tmp1; \
	VPSRLD $(32-n), r, d; \
	VPOR tmp1, d, d

#define AVX_SS1SS2(index, a, e, SS1, SS2) \
	VPROLD2(a, SS2, 12); \ // a <<< 12
	AVX_LOAD_T(index, SS1); \
	VPADDD SS1, SS2, SS1; \
	VPADDD e, SS1, SS1; \
	VPROLD(SS1, 7); \ // SS1
	VPXOR SS1, SS2, SS2

// DST = X XOR Y XOR Z
#define AVX_FF0(X, Y, Z, DST) \
	VPXOR X, Y, DST; \
	VPXOR Z, DST, DST

// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define AVX_FF1(X, Y, Z, TMP, DST) \
	VPOR X, Y, DST; \
	VPAND X, Y, TMP; \
	VPAND Z, DST, DST; \
	VPOR TMP, DST, DST

// DST = X XOR Y XOR Z
#define AVX_GG0(X, Y, Z, DST) \
	AVX_FF0(X, Y, Z, DST)

// DST = ((Y XOR Z) AND X) XOR Z
#define AVX_GG1(X, Y, Z, DST) \
	VPXOR Y, Z, DST; \
	VPAND X, DST, DST; \
	VPXOR Z, DST, DST

#define AVX_COPY_RESULT(b, d, f, h, TT1, TT2) \
	VPROLD(b, 9); \
	VMOVDQU TT1, h; \
	VPROLD(f, 19); \
	VPROLD2(TT2, TT1, 9); \ // tt2 <<< 9
	VPXOR TT2, TT1, TT2; \ // tt2 XOR ROTL(9, tt2)
	VPSHUFB r08_mask<>(SB), TT1, TT1; \ // ROTL(17, tt2)
	VPXOR TT2, TT1, d

#define AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h) \
	AVX_SS1SS2(index, a, e, X12, X13); \
	; \
	AVX_FF0(a, b, c, X14); \
	VPADDD d, X14, X14; \ // (a XOR b XOR c) + d
	avxLoadWord(X10, index); \
	avxLoadWord(X11, index+4); \
	VPXOR X10, X11, X11; \ // Wt XOR Wt+4
	VPADDD X11, X14, X14; \ // (a XOR b XOR c) + d + Wt XOR Wt+4
	VPADDD X14, X13, X13; \ // TT1
	VPADDD h, X10, X10; \ // Wt + h
	VPADDD X12, X10, X10; \ // Wt + h + SS1
	AVX_GG0(e, f, g, X11); \
	VPADDD X11, X10, X10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	; \ // copy result
	AVX_COPY_RESULT(b, d, f, h, X13, X10)

#define AVX_MESSAGE_SCHEDULE(index) \
	avxLoadWord(X10, index+1); \ // Wj-3
	VPROLD(X10, 15); \
	VPXOR (128+(index-12)*16)(BX), X10, X10; \ // Wj-16
	VPXOR (128+(index-5)*16)(BX), X10, X10; \ // Wj-9
	; \ // P1
	VPROLD2(X10, X11, 15); \
	VPXOR X11, X10, X10; \
	VPSHUFB r08_mask<>(SB), X11, X11; \
	VPXOR X11, X10, X10; \ // P1
	avxLoadWord(X11, index-9); \ // Wj-13
	VPROLD(X11, 7); \
	VPXOR X11, X10, X10; \
	VPXOR (128+(index-2)*16)(BX), X10, X11; \
	avxStoreWord(X11, index+4)

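// Reference for the schedule computed by MESSAGE_SCHEDULE/AVX_MESSAGE_SCHEDULE
// above (SM3 message expansion, with P1(x) = x XOR (x <<< 15) XOR (x <<< 23)):
//   W[j] = P1(W[j-16] XOR W[j-9] XOR (W[j-3] <<< 15)) XOR (W[j-13] <<< 7) XOR W[j-6]
// where j = index+4, matching the loadWord offsets used in the macros; the
// (x <<< 23) term is derived from (x <<< 15) via the r08_mask byte shuffle.
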
#define AVX_ROUND_12_15(index, a, b, c, d, e, f, g, h) \
	AVX_MESSAGE_SCHEDULE(index); \
	AVX_ROUND_00_11(index, a, b, c, d, e, f, g, h)

#define AVX_ROUND_16_63(index, a, b, c, d, e, f, g, h) \
	AVX_MESSAGE_SCHEDULE(index); \ // X11 is Wt+4 now, please do not use it
	AVX_SS1SS2(index, a, e, X12, X13); \
	; \
	AVX_FF1(a, b, c, X10, X14); \
	VPADDD d, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d
	avxLoadWord(X10, index); \
	VPXOR X10, X11, X11; \ // Wt XOR Wt+4
	VPADDD X11, X14, X14; \ // (a AND b) OR (a AND c) OR (b AND c) + d + Wt XOR Wt+4
	VPADDD X14, X13, X13; \ // TT1
	; \
	VPADDD h, X10, X10; \ // Wt + h
	VPADDD X12, X10, X10; \ // Wt + h + SS1
	AVX_GG1(e, f, g, X11); \
	VPADDD X11, X10, X10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
	; \ // copy result
	AVX_COPY_RESULT(b, d, f, h, X13, X10)

// blockMultBy4(dig **[8]uint32, p *[]byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB),NOSPLIT,$0
	MOVQ dig+0(FP), DI
	MOVQ p+8(FP), SI
	MOVQ buffer+16(FP), BX
	MOVQ blocks+24(FP), DX

	CMPB ·useAVX(SB), $1
	JE avx

	// load state
	MOVQ (DI), R8
	MOVOU (0*16)(R8), a
	MOVOU (1*16)(R8), e
	MOVQ 8(DI), R8
	MOVOU (0*16)(R8), b
	MOVOU (1*16)(R8), f
	MOVQ 16(DI), R8
	MOVOU (0*16)(R8), c
	MOVOU (1*16)(R8), g
	MOVQ 24(DI), R8
	MOVOU (0*16)(R8), d
	MOVOU (1*16)(R8), h

	// transpose state
	SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	// store state to temporary buffer
	storeState(BX)

	MOVQ $·_K+0(SB), AX
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11

loop:
	// load message block
	prepare4Words(0)
	prepare4Words(1)
	prepare4Words(2)
	prepare4Words(3)

	ROUND_00_11(0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, b, c, d, e, f, g, h, a)

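	// rounds 16..63: FF1/GG1 boolean functions, message words expanded on the fly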
	ROUND_16_63(16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, b, c, d, e, f, g, h, a)
	ROUND_16_63(48, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, b, c, d, e, f, g, h, a)

	MOVOU (0*16)(BX), tmp1
	PXOR tmp1, a
	MOVOU (1*16)(BX), tmp1
	PXOR tmp1, b
	MOVOU (2*16)(BX), tmp1
	PXOR tmp1, c
	MOVOU (3*16)(BX), tmp1
	PXOR tmp1, d
	MOVOU (4*16)(BX), tmp1
	PXOR tmp1, e
	MOVOU (5*16)(BX), tmp1
	PXOR tmp1, f
	MOVOU (6*16)(BX), tmp1
	PXOR tmp1, g
	MOVOU (7*16)(BX), tmp1
	PXOR tmp1, h

	DECQ DX
	JZ end

	storeState(BX)
	LEAQ 64(R8), R8
	LEAQ 64(R9), R9
	LEAQ 64(R10), R10
	LEAQ 64(R11), R11
	JMP loop

end:
	// transpose state
	SSE_TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	SSE_TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	MOVQ (DI), R8
	MOVOU a, (0*16)(R8)
	MOVOU e, (1*16)(R8)
	MOVQ 8(DI), R8
	MOVOU b, (0*16)(R8)
	MOVOU f, (1*16)(R8)
	MOVQ 16(DI), R8
	MOVOU c, (0*16)(R8)
	MOVOU g, (1*16)(R8)
	MOVQ 24(DI), R8
	MOVOU d, (0*16)(R8)
	MOVOU h, (1*16)(R8)

	RET

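// AVX path: the same four-lane SM3 compression, using the VEX-encoded
// three-operand forms (VPXOR, VPADDD, ...) so fewer register-to-register
// copies are needed than in the SSE path above.
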
avx:
	// load state
	MOVQ (DI), R8
	VMOVDQU (0*16)(R8), a
	VMOVDQU (1*16)(R8), e
	MOVQ 8(DI), R8
	VMOVDQU (0*16)(R8), b
	VMOVDQU (1*16)(R8), f
	MOVQ 16(DI), R8
	VMOVDQU (0*16)(R8), c
	VMOVDQU (1*16)(R8), g
	MOVQ 24(DI), R8
	VMOVDQU (0*16)(R8), d
	VMOVDQU (1*16)(R8), h

	// transpose state
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	avxStoreState(BX)

	MOVQ $·_K+0(SB), AX
	MOVQ (SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11

avxLoop:
	// load message block
	avxPrepare4Words(0)
	avxPrepare4Words(1)
	avxPrepare4Words(2)
	avxPrepare4Words(3)

	AVX_ROUND_00_11(0, a, b, c, d, e, f, g, h)
	AVX_ROUND_00_11(1, h, a, b, c, d, e, f, g)
	AVX_ROUND_00_11(2, g, h, a, b, c, d, e, f)
	AVX_ROUND_00_11(3, f, g, h, a, b, c, d, e)
	AVX_ROUND_00_11(4, e, f, g, h, a, b, c, d)
	AVX_ROUND_00_11(5, d, e, f, g, h, a, b, c)
	AVX_ROUND_00_11(6, c, d, e, f, g, h, a, b)
	AVX_ROUND_00_11(7, b, c, d, e, f, g, h, a)
	AVX_ROUND_00_11(8, a, b, c, d, e, f, g, h)
	AVX_ROUND_00_11(9, h, a, b, c, d, e, f, g)
	AVX_ROUND_00_11(10, g, h, a, b, c, d, e, f)
	AVX_ROUND_00_11(11, f, g, h, a, b, c, d, e)

	AVX_ROUND_12_15(12, e, f, g, h, a, b, c, d)
	AVX_ROUND_12_15(13, d, e, f, g, h, a, b, c)
	AVX_ROUND_12_15(14, c, d, e, f, g, h, a, b)
	AVX_ROUND_12_15(15, b, c, d, e, f, g, h, a)

	AVX_ROUND_16_63(16, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(17, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(18, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(19, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(20, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(21, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(22, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(23, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(24, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(25, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(26, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(27, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(28, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(29, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(30, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(31, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(32, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(33, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(34, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(35, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(36, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(37, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(38, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(39, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(40, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(41, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(42, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(43, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(44, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(45, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(46, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(47, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(48, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(49, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(50, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(51, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(52, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(53, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(54, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(55, b, c, d, e, f, g, h, a)
	AVX_ROUND_16_63(56, a, b, c, d, e, f, g, h)
	AVX_ROUND_16_63(57, h, a, b, c, d, e, f, g)
	AVX_ROUND_16_63(58, g, h, a, b, c, d, e, f)
	AVX_ROUND_16_63(59, f, g, h, a, b, c, d, e)
	AVX_ROUND_16_63(60, e, f, g, h, a, b, c, d)
	AVX_ROUND_16_63(61, d, e, f, g, h, a, b, c)
	AVX_ROUND_16_63(62, c, d, e, f, g, h, a, b)
	AVX_ROUND_16_63(63, b, c, d, e, f, g, h, a)

	VPXOR (0*16)(BX), a, a
	VPXOR (1*16)(BX), b, b
	VPXOR (2*16)(BX), c, c
	VPXOR (3*16)(BX), d, d
	VPXOR (4*16)(BX), e, e
	VPXOR (5*16)(BX), f, f
	VPXOR (6*16)(BX), g, g
	VPXOR (7*16)(BX), h, h

	DECQ DX
	JZ avxEnd

	// store current state
	avxStoreState(BX)

	LEAQ 64(R8), R8
	LEAQ 64(R9), R9
	LEAQ 64(R10), R10
	LEAQ 64(R11), R11
	JMP avxLoop

avxEnd:
	// transpose state
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2)

	MOVQ (DI), R8
	VMOVDQU a, (0*16)(R8)
	VMOVDQU e, (1*16)(R8)
	MOVQ 8(DI), R8
	VMOVDQU b, (0*16)(R8)
	VMOVDQU f, (1*16)(R8)
	MOVQ 16(DI), R8
	VMOVDQU c, (0*16)(R8)
	VMOVDQU g, (1*16)(R8)
	MOVQ 24(DI), R8
	VMOVDQU d, (0*16)(R8)
	VMOVDQU h, (1*16)(R8)

	RET

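// copyResultsBy4 converts the 32 little-endian state words at dig to
// big-endian byte order and writes the resulting 128 bytes to dst; the SSE
// and AVX paths below differ only in the instructions used for the shuffle.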
// func copyResultsBy4(dig *uint32, dst *byte)
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
	MOVQ dig+0(FP), DI
	MOVQ dst+8(FP), SI

	CMPB ·useAVX(SB), $1
	JE avx

	// load state
	MOVOU (0*16)(DI), a
	MOVOU (1*16)(DI), b
	MOVOU (2*16)(DI), c
	MOVOU (3*16)(DI), d
	MOVOU (4*16)(DI), e
	MOVOU (5*16)(DI), f
	MOVOU (6*16)(DI), g
	MOVOU (7*16)(DI), h

	SSE_REV32(a, b, c, d)
	SSE_REV32(e, f, g, h)
	storeState(SI)

	RET

avx:
	// load state
	VMOVDQU (0*16)(DI), a
	VMOVDQU (1*16)(DI), b
	VMOVDQU (2*16)(DI), c
	VMOVDQU (3*16)(DI), d
	VMOVDQU (4*16)(DI), e
	VMOVDQU (5*16)(DI), f
	VMOVDQU (6*16)(DI), g
	VMOVDQU (7*16)(DI), h

	AVX_REV32(a, b, c, d)
	AVX_REV32(e, f, g, h)

	avxStoreState(SI)

	RET