github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_avx2_amd64.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $32

#define a Y0
#define b Y1
#define c Y2
#define d Y3
#define e Y4
#define f Y5
#define g Y6
#define h Y7
#define TMP1 Y8
#define TMP2 Y9
#define TMP3 Y10
#define TMP4 Y11

#define srcPtr1 CX
#define srcPtr2 R8
#define srcPtr3 R9
#define srcPtr4 R10
#define srcPtr5 R11
#define srcPtr6 R12
#define srcPtr7 R13
#define srcPtr8 R14

// transpose matrix function, AVX2 version
// parameters:
// - r0: 256 bits register as input/output data
// - r1: 256 bits register as input/output data
// - r2: 256 bits register as input/output data
// - r3: 256 bits register as input/output data
// - r4: 256 bits register as input/output data
// - r5: 256 bits register as input/output data
// - r6: 256 bits register as input/output data
// - r7: 256 bits register as input/output data
// - tmp1: 256 bits temp register
// - tmp2: 256 bits temp register
// - tmp3: 256 bits temp register
// - tmp4: 256 bits temp register
#define TRANSPOSE_MATRIX(r0, r1, r2, r3, r4, r5, r6, r7, tmp1, tmp2, tmp3, tmp4) \
	; \ // [r0, r1, r2, r3] => [tmp3, tmp4, tmp2, tmp1]
	VPUNPCKHDQ r1, r0, tmp4; \ // tmp4 = [w15, w7, w14, w6, w11, w3, w10, w2]
	VPUNPCKLDQ r1, r0, r0; \ // r0 = [w13, w5, w12, w4, w9, w1, w8, w0]
	VPUNPCKLDQ r3, r2, tmp3; \ // tmp3 = [w29, w21, w28, w20, w25, w17, w24, w16]
	VPUNPCKHDQ r3, r2, r2; \ // r2 = [w31, w23, w30, w22, w27, w19, w26, w18]
	VPUNPCKHQDQ tmp3, r0, tmp2; \ // tmp2 = [w29, w21, w13, w5, w25, w17, w9, w1]
	VPUNPCKLQDQ tmp3, r0, tmp1; \ // tmp1 = [w28, w20, w12, w4, w24, w16, w8, w0]
	VPUNPCKHQDQ r2, tmp4, tmp3; \ // tmp3 = [w31, w23, w15, w7, w27, w19, w11, w3]
	VPUNPCKLQDQ r2, tmp4, tmp4; \ // tmp4 = [w30, w22, w14, w6, w26, w18, w10, w2]
	; \ // [r4, r5, r6, r7] => [r4, r5, r6, r7]
	VPUNPCKHDQ r5, r4, r1; \ // r1 = [w47, w39, w46, w38, w43, w35, w42, w34]
	VPUNPCKLDQ r5, r4, r4; \ // r4 = [w45, w37, w44, w36, w41, w33, w40, w32]
	VPUNPCKLDQ r7, r6, r0; \ // r0 = [w61, w53, w60, w52, w57, w49, w56, w48]
	VPUNPCKHDQ r7, r6, r6; \ // r6 = [w63, w55, w62, w54, w59, w51, w58, w50]
	VPUNPCKHQDQ r0, r4, r5; \ // r5 = [w61, w53, w45, w37, w57, w49, w41, w33]
	VPUNPCKLQDQ r0, r4, r4; \ // r4 = [w60, w52, w44, w36, w56, w48, w40, w32]
	VPUNPCKHQDQ r6, r1, r7; \ // r7 = [w63, w55, w47, w39, w59, w51, w43, w35]
	VPUNPCKLQDQ r6, r1, r6; \ // r6 = [w62, w54, w46, w38, w58, w50, w42, w34]
	; \ // [tmp1, tmp2, tmp4, tmp3], [r4, r5, r6, r7] => [r0, r1, r2, r3, r4, r5, r6, r7]
	VPERM2I128 $0x20, r4, tmp1, r0; \ // r0 = [w56, w48, w40, w32, w24, w16, w8, w0]
	VPERM2I128 $0x20, r5, tmp2, r1; \ // r1 = [w57, w49, w41, w33, w25, w17, w9, w1]
	VPERM2I128 $0x20, r6, tmp4, r2; \ // r2 = [w58, w50, w42, w34, w26, w18, w10, w2]
	VPERM2I128 $0x20, r7, tmp3, r3; \ // r3 = [w59, w51, w43, w35, w27, w19, w11, w3]
	VPERM2I128 $0x31, r4, tmp1, r4; \ // r4 = [w60, w52, w44, w36, w28, w20, w12, w4]
	VPERM2I128 $0x31, r5, tmp2, r5; \ // r5 = [w61, w53, w45, w37, w29, w21, w13, w5]
	VPERM2I128 $0x31, r6, tmp4, r6; \ // r6 = [w62, w54, w46, w38, w30, w22, w14, w6]
	VPERM2I128 $0x31, r7, tmp3, r7; \ // r7 = [w63, w55, w47, w39, w31, w23, w15, w7]
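// The macro above is a plain 8x8 transpose of 32-bit words: afterwards,
// register k holds word k of all eight inputs, so one vector instruction
// operates on the same state variable of eight independent hash lanes.
// A minimal scalar sketch of the same operation, for reference only (not
// part of this package's API; names are local to this sketch):
//
//	func transpose8x8(m *[8][8]uint32) {
//		for i := 0; i < 8; i++ {
//			for j := i + 1; j < 8; j++ {
//				m[i][j], m[j][i] = m[j][i], m[i][j]
//			}
//		}
//	}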
// store 256 bits
#define storeWord(W, j) VMOVDQU W, (256+(j)*32)(BX)
// load 256 bits
#define loadWord(W, i) VMOVDQU (256+(i)*32)(BX), W

#define REV32(a, b, c, d, e, f, g, h) \
	VPSHUFB flip_mask<>(SB), a, a; \
	VPSHUFB flip_mask<>(SB), b, b; \
	VPSHUFB flip_mask<>(SB), c, c; \
	VPSHUFB flip_mask<>(SB), d, d; \
	VPSHUFB flip_mask<>(SB), e, e; \
	VPSHUFB flip_mask<>(SB), f, f; \
	VPSHUFB flip_mask<>(SB), g, g; \
	VPSHUFB flip_mask<>(SB), h, h

#define prepare8Words(i) \
	VMOVDQU (i*32)(srcPtr1), a; \
	VMOVDQU (i*32)(srcPtr2), b; \
	VMOVDQU (i*32)(srcPtr3), c; \
	VMOVDQU (i*32)(srcPtr4), d; \
	VMOVDQU (i*32)(srcPtr5), e; \
	VMOVDQU (i*32)(srcPtr6), f; \
	VMOVDQU (i*32)(srcPtr7), g; \
	VMOVDQU (i*32)(srcPtr8), h; \
	; \
	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4); \
	REV32(a, b, c, d, e, f, g, h); \
	; \
	storeWord(a, 8*i+0); \
	storeWord(b, 8*i+1); \
	storeWord(c, 8*i+2); \
	storeWord(d, 8*i+3); \
	storeWord(e, 8*i+4); \
	storeWord(f, 8*i+5); \
	storeWord(g, 8*i+6); \
	storeWord(h, 8*i+7)

#define saveState(R) \
	VMOVDQU a, (0*32)(R); \
	VMOVDQU b, (1*32)(R); \
	VMOVDQU c, (2*32)(R); \
	VMOVDQU d, (3*32)(R); \
	VMOVDQU e, (4*32)(R); \
	VMOVDQU f, (5*32)(R); \
	VMOVDQU g, (6*32)(R); \
	VMOVDQU h, (7*32)(R)

#define loadState(R) \
	VMOVDQU (0*32)(R), a; \
	VMOVDQU (1*32)(R), b; \
	VMOVDQU (2*32)(R), c; \
	VMOVDQU (3*32)(R), d; \
	VMOVDQU (4*32)(R), e; \
	VMOVDQU (5*32)(R), f; \
	VMOVDQU (6*32)(R), g; \
	VMOVDQU (7*32)(R), h

// r <<< n
#define VPROLD(r, n) \
	VPSLLD $(n), r, TMP1; \
	VPSRLD $(32-n), r, r; \
	VPOR TMP1, r, r

// d = r <<< n
#define VPROLD2(r, d, n) \
	VPSLLD $(n), r, TMP1; \
	VPSRLD $(32-n), r, d; \
	VPOR TMP1, d, d

#define LOAD_T(index, T) \
	VPBROADCASTD (index*4)(AX), T
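// AVX2 has no vector rotate (VPROLD proper is AVX-512), so the VPROLD macros
// emulate a 32-bit left rotate with shift/shift/or. Separately, r08_mask
// turns VPSHUFB into a rotate by 8: applied to a value already rotated by 9
// or 15, that yields the rotates by 17 and 23 that SM3's P0/P1 need.
// A scalar sketch of the identity being exploited (illustration only):
//
//	import "math/bits"
//
//	// x <<< 17 == (x <<< 9) <<< 8; a rotate by 8 permutes whole bytes,
//	// which one VPSHUFB handles for all eight lanes at once.
//	func rol17(x uint32) uint32 {
//		return bits.RotateLeft32(bits.RotateLeft32(x, 9), 8)
//	}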
// DST = X XOR Y XOR Z
#define FF0(X, Y, Z, DST) \
	VPXOR X, Y, DST; \
	VPXOR Z, DST, DST

// DST = (X AND Y) OR (X AND Z) OR (Y AND Z)
#define FF1(X, Y, Z, TMP, DST) \
	VPOR X, Y, DST; \
	VPAND X, Y, TMP; \
	VPAND Z, DST, DST; \
	VPOR TMP, DST, DST

// DST = X XOR Y XOR Z
#define GG0(X, Y, Z, DST) \
	FF0(X, Y, Z, DST)

// DST = ((Y XOR Z) AND X) XOR Z
#define GG1(X, Y, Z, DST) \
	VPXOR Y, Z, DST; \
	VPAND X, DST, DST; \
	VPXOR Z, DST, DST

#define SS1SS2(index, a, e, SS1, SS2) \
	VPROLD2(a, SS2, 12); \ // a <<< 12
	LOAD_T(index, SS1); \ // const
	VPADDD SS1, SS2, SS1; \
	VPADDD e, SS1, SS1; \
	VPROLD(SS1, 7); \ // SS1
	VPXOR SS1, SS2, SS2 // SS2

#define COPY_RESULT(b, d, f, h, TT1, TT2) \
	VPROLD(b, 9); \
	VMOVDQU TT1, h; \ // h = TT1
	VPROLD(f, 19); \
	VPROLD2(TT2, TT1, 9); \ // TT1 = TT2 <<< 9
	VPXOR TT2, TT1, TT2; \ // TT2 XOR (TT2 <<< 9)
	VPSHUFB r08_mask<>(SB), TT1, TT1; \ // TT2 <<< 17
	VPXOR TT1, TT2, d // d = P0(TT2)

#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
	SS1SS2(index, a, e, Y12, Y13); \
	; \
	FF0(a, b, c, Y14); \
	VPADDD d, Y14, Y14; \ // (a XOR b XOR c) + d
	loadWord(Y10, index); \
	loadWord(Y11, index+4); \
	VPXOR Y10, Y11, Y11; \ // Wt XOR Wt+4
	VPADDD Y11, Y14, Y14; \ // (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VPADDD Y14, Y13, Y13; \ // TT1
	VPADDD h, Y10, Y10; \ // Wt + h
	VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
	GG0(e, f, g, Y11); \
	VPADDD Y11, Y10, Y10; \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	; \ // copy result
	COPY_RESULT(b, d, f, h, Y13, Y10)

#define MESSAGE_SCHEDULE(index) \
	loadWord(Y10, index+1); \ // Wj-3
	VPROLD(Y10, 15); \
	VPXOR (256+(index-12)*32)(BX), Y10, Y10; \ // XOR Wj-16
	VPXOR (256+(index-5)*32)(BX), Y10, Y10; \ // XOR Wj-9
	; \ // P1
	VPROLD2(Y10, Y11, 15); \
	VPXOR Y11, Y10, Y10; \
	VPSHUFB r08_mask<>(SB), Y11, Y11; \
	VPXOR Y11, Y10, Y10; \ // P1
	loadWord(Y11, index-9); \ // Wj-13
	VPROLD(Y11, 7); \
	VPXOR Y11, Y10, Y10; \
	VPXOR (256+(index-2)*32)(BX), Y10, Y11; \ // XOR Wj-6
	storeWord(Y11, index+4)

#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \
	ROUND_00_11(index, a, b, c, d, e, f, g, h)

#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index); \ // Y11 now holds Wt+4; do not clobber it
	SS1SS2(index, a, e, Y12, Y13); \
	; \
	FF1(a, b, c, Y10, Y14); \ // (a AND b) OR (a AND c) OR (b AND c)
	VPADDD d, Y14, Y14; \ // FF1(a, b, c) + d
	loadWord(Y10, index); \
	VPXOR Y10, Y11, Y11; \ // Wt XOR Wt+4
	VPADDD Y11, Y14, Y14; \ // FF1(a, b, c) + d + (Wt XOR Wt+4)
	VPADDD Y14, Y13, Y13; \ // TT1
	; \
	VPADDD h, Y10, Y10; \ // Wt + h
	VPADDD Y12, Y10, Y10; \ // Wt + h + SS1
	GG1(e, f, g, Y11); \
	VPADDD Y11, Y10, Y10; \ // TT2 = GG1(e, f, g) + Wt + h + SS1
	; \ // copy result
	COPY_RESULT(b, d, f, h, Y13, Y10)
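// The round macros above vectorize the standard SM3 round across eight lanes.
// A minimal scalar sketch of one round j >= 16, following the GB/T 32905-2016
// definitions the comments refer to (illustration only; names are local to
// this sketch, not this package's Go API; t is the precomputed constant
// T(j) <<< j from the _K table):
//
//	import "math/bits"
//
//	func round16To63(v *[8]uint32, w, w4, t uint32) {
//		a, b, c, d, e, f, g, h := v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]
//		a12 := bits.RotateLeft32(a, 12)
//		ss1 := bits.RotateLeft32(a12+e+t, 7)
//		ss2 := ss1 ^ a12
//		tt1 := (a&b | a&c | b&c) + d + ss2 + (w ^ w4) // FF1
//		tt2 := ((f^g)&e ^ g) + h + ss1 + w            // GG1
//		p0 := tt2 ^ bits.RotateLeft32(tt2, 9) ^ bits.RotateLeft32(tt2, 17)
//		v[0], v[1], v[2], v[3] = tt1, a, bits.RotateLeft32(b, 9), c
//		v[4], v[5], v[6], v[7] = p0, e, bits.RotateLeft32(f, 19), g
//	}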
// transposeMatrix8x8(dig **[8]uint32)
TEXT ·transposeMatrix8x8(SB),NOSPLIT,$0
	MOVQ dig+0(FP), DI

	// load state
	MOVQ (DI), R8
	VMOVDQU (R8), a
	MOVQ 8(DI), R8
	VMOVDQU (R8), b
	MOVQ 16(DI), R8
	VMOVDQU (R8), c
	MOVQ 24(DI), R8
	VMOVDQU (R8), d
	MOVQ 32(DI), R8
	VMOVDQU (R8), e
	MOVQ 40(DI), R8
	VMOVDQU (R8), f
	MOVQ 48(DI), R8
	VMOVDQU (R8), g
	MOVQ 56(DI), R8
	VMOVDQU (R8), h

	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)

	// save state
	MOVQ (DI), R8
	VMOVDQU a, (R8)
	MOVQ 8(DI), R8
	VMOVDQU b, (R8)
	MOVQ 16(DI), R8
	VMOVDQU c, (R8)
	MOVQ 24(DI), R8
	VMOVDQU d, (R8)
	MOVQ 32(DI), R8
	VMOVDQU e, (R8)
	MOVQ 40(DI), R8
	VMOVDQU f, (R8)
	MOVQ 48(DI), R8
	VMOVDQU g, (R8)
	MOVQ 56(DI), R8
	VMOVDQU h, (R8)

	VZEROUPPER
	RET

// blockMultBy8(dig **[8]uint32, p **byte, buffer *byte, blocks int)
TEXT ·blockMultBy8(SB),NOSPLIT,$0
	MOVQ dig+0(FP), DI
	MOVQ p+8(FP), SI
	MOVQ buffer+16(FP), BX
	MOVQ blocks+24(FP), DX

	// load state
	MOVQ (DI), R8
	VMOVDQU (R8), a
	MOVQ 8(DI), R8
	VMOVDQU (R8), b
	MOVQ 16(DI), R8
	VMOVDQU (R8), c
	MOVQ 24(DI), R8
	VMOVDQU (R8), d
	MOVQ 32(DI), R8
	VMOVDQU (R8), e
	MOVQ 40(DI), R8
	VMOVDQU (R8), f
	MOVQ 48(DI), R8
	VMOVDQU (R8), g
	MOVQ 56(DI), R8
	VMOVDQU (R8), h

	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)

	saveState(BX)

	MOVQ $·_K+0(SB), AX
	MOVQ (0*8)(SI), srcPtr1
	MOVQ (1*8)(SI), srcPtr2
	MOVQ (2*8)(SI), srcPtr3
	MOVQ (3*8)(SI), srcPtr4
	MOVQ (4*8)(SI), srcPtr5
	MOVQ (5*8)(SI), srcPtr6
	MOVQ (6*8)(SI), srcPtr7
	MOVQ (7*8)(SI), srcPtr8

loop:
	prepare8Words(0)
	prepare8Words(1)

	// Reload the state: prepare8Words clobbers the YMM registers.
	loadState(BX)
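	// The 64 rounds below are fully unrolled, rotating the eight working
	// variables one position per round instead of shuffling registers:
	// rounds 0-11 use FF0/GG0 with the message words already expanded by
	// prepare8Words, rounds 12-15 extend the schedule as they go, and
	// rounds 16-63 switch to FF1/GG1 per the SM3 specification.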
	ROUND_00_11(0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, b, c, d, e, f, g, h, a)
	ROUND_16_63(48, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, b, c, d, e, f, g, h, a)

	// feed-forward: XOR the previous state (saved at BX) into the new one
	VPXOR (0*32)(BX), a, a
	VPXOR (1*32)(BX), b, b
	VPXOR (2*32)(BX), c, c
	VPXOR (3*32)(BX), d, d
	VPXOR (4*32)(BX), e, e
	VPXOR (5*32)(BX), f, f
	VPXOR (6*32)(BX), g, g
	VPXOR (7*32)(BX), h, h

	DECQ DX
	JZ end

	saveState(BX)
	LEAQ 64(srcPtr1), srcPtr1
	LEAQ 64(srcPtr2), srcPtr2
	LEAQ 64(srcPtr3), srcPtr3
	LEAQ 64(srcPtr4), srcPtr4
	LEAQ 64(srcPtr5), srcPtr5
	LEAQ 64(srcPtr6), srcPtr6
	LEAQ 64(srcPtr7), srcPtr7
	LEAQ 64(srcPtr8), srcPtr8

	JMP loop

end:
	TRANSPOSE_MATRIX(a, b, c, d, e, f, g, h, TMP1, TMP2, TMP3, TMP4)

	// save state
	MOVQ (DI), R8
	VMOVDQU a, (R8)
	MOVQ 8(DI), R8
	VMOVDQU b, (R8)
	MOVQ 16(DI), R8
	VMOVDQU c, (R8)
	MOVQ 24(DI), R8
	VMOVDQU d, (R8)
	MOVQ 32(DI), R8
	VMOVDQU e, (R8)
	MOVQ 40(DI), R8
	VMOVDQU f, (R8)
	MOVQ 48(DI), R8
	VMOVDQU g, (R8)
	MOVQ 56(DI), R8
	VMOVDQU h, (R8)

	VZEROUPPER
	RET

// func copyResultsBy8(dig *uint32, dst *byte)
TEXT ·copyResultsBy8(SB),NOSPLIT,$0
	MOVQ dig+0(FP), DI
	MOVQ dst+8(FP), SI

	loadState(DI)
	REV32(a, b, c, d, e, f, g, h)
	saveState(SI)

	VZEROUPPER
	RET
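
// Buffer layout used by blockMultBy8, as inferred from the offsets above (an
// observation, not a documented contract): bytes [0, 256) hold the transposed
// hash state (8 YMM-sized rows), and bytes [256, 256+68*32) hold the message
// schedule W0..W67, one 8-lane vector per word, so the caller must supply at
// least 2432 bytes. A hedged sketch of a call site (hypothetical variable
// names; the real wrappers live in this package's Go sources):
//
//	var buf [256 + 68*32]byte // 2432 bytes: state + message schedule
//	blockMultBy8(&digPtrs[0], &srcPtrs[0], &buf[0], blocks)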