github.com/emmansun/gmsm@v0.29.1/sm3/sm3block_simd_amd64.s

//go:build !purego

#include "textflag.h"

#include "sm3_const_asm.s"
// Definitions for AVX version

// xorm (mem), reg
// Xor reg to mem using reg-mem xor and store
#define xorm(P1, P2) \
    XORL P2, P1; \
    MOVL P1, P2

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 X0
#define XTMP1 X1
#define XTMP2 X2
#define XTMP3 X3
#define XTMP4 X8

#define XFER X9
#define R08_SHUFFLE_MASK X10
#define X_BYTE_FLIP_MASK X13 // mask to convert LE -> BE

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define y0 R12
#define y1 R13
#define y2 R14

// Offsets
#define XFER_SIZE 2*16
#define INP_END_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define STACK_SIZE _INP_END + INP_END_SIZE

#define SS12(a, e, const, ss1, ss2) \
    MOVL a, ss2; \
    ROLL $12, ss2; \ // y0 = a <<< 12
    MOVL e, ss1; \
    ADDL $const, ss1; \
    ADDL ss2, ss1; \ // y2 = a <<< 12 + e + T
    ROLL $7, ss1; \ // y2 = SS1
    XORL ss1, ss2

#define P0(tt2, tmp, out) \
    MOVL tt2, tmp; \
    ROLL $9, tmp; \
    MOVL tt2, out; \
    ROLL $17, out; \
    XORL tmp, out; \
    XORL tt2, out
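
// The two macros above are the scalar SM3 building blocks used by every round
// below: SS12 produces the SS1/SS2 words and P0 is the permutation
// P0(x) = x ^ (x <<< 9) ^ (x <<< 17). As a rough Go reference (illustrative
// only, not part of this package; constT stands for the pre-rotated round
// constant from sm3_const_asm.s and bits is math/bits):
//
//	ss2 := bits.RotateLeft32(a, 12)           // a <<< 12
//	ss1 := bits.RotateLeft32(ss2+e+constT, 7) // SS1 = ((a <<< 12) + e + T_j) <<< 7
//	ss2 ^= ss1                                // SS2 = SS1 ^ (a <<< 12)
//	p0 := x ^ bits.RotateLeft32(x, 9) ^ bits.RotateLeft32(x, 17)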

// For rounds [0 - 16)
#define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 0 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    VPALIGNR $12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    VPSLLD $7, XTMP0, XTMP1; \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
    ADDL (disp + 0*4)(SP), y2; \ // y2 = SS1 + W
    ADDL h, y2; \ // y2 = h + SS1 + W
    ADDL (disp + 0*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    VPSRLD $(32-7), XTMP0, XTMP0; \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
    ADDL d, y0; \ // y0 = d + SS2 + W'
    MOVL a, h; \
    XORL b, h; \
    VPOR XTMP0, XTMP1, XTMP1; \ // XTMP1 = W[-13] rol 7
    XORL c, h; \
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    MOVL e, y1; \
    VPALIGNR $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
    XORL f, y1; \
    XORL g, y1; \
    ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
    VPXOR XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
    ROLL $9, b; \
    ROLL $19, f; \
    MOVL y2, y0; \
    ROLL $9, y0; \
    VPALIGNR $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
    MOVL y2, d; \
    ROLL $17, d; \
    XORL y0, d; \
    XORL y2, d; \ // d = P(tt2)
    VPXOR XWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9] ^ W[-16]

#define ROUND_AND_SCHED_N_0_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 1 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    VPSHUFD $0xA5, XWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    ADDL (disp + 1*4)(SP), y2; \ // y2 = SS1 + W
    VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {xBxA}
    ADDL h, y2; \ // y2 = h + SS1 + W
    ADDL (disp + 1*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
    MOVL a, h; \
    XORL b, h; \
    XORL c, h; \
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    VPSHUFD $0x00, XTMP2, XTMP2; \ // XTMP2 = {AAAA}
    MOVL e, y1; \
    XORL f, y1; \
    XORL g, y1; \
    ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
    VPSRLQ $17, XTMP2, XTMP3; \ // XTMP3 = XTMP2 rol 15 {xxxA}
    ROLL $9, b; \
    ROLL $19, f; \
    MOVL y2, y0; \
    ROLL $9, y0; \
    VPSRLQ $9, XTMP2, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
    MOVL y2, d; \
    ROLL $17, d; \
    XORL y0, d; \
    XORL y2, d; \ // d = P(tt2)
    VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 ^ (XTMP2 rol 23 {xxxA})

#define ROUND_AND_SCHED_N_0_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 2 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    VPXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    ADDL (disp + 2*4)(SP), y2; \ // y2 = SS1 + W
    VPXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {..., ..., ..., W[0]}
    ADDL h, y2; \ // y2 = h + SS1 + W
    ADDL (disp + 2*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    VPALIGNR $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = {W[0], w15, w14, w13}
    MOVL a, h; \
    XORL b, h; \
    XORL c, h; \
    VPSLLD $15, XTMP3, XTMP4; \
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    MOVL e, y1; \
    XORL f, y1; \
    XORL g, y1; \
    VPSRLD $(32-15), XTMP3, XTMP3; \
    ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
    ROLL $9, b; \
    ROLL $19, f; \
    MOVL y2, y0; \
    ROLL $9, y0; \
    VPOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = (W[-3] rol 15) {DCxx}
    MOVL y2, d; \
    ROLL $17, d; \
    XORL y0, d; \
    XORL y2, d; \ // d = P(tt2)
    VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCxx}

#define ROUND_AND_SCHED_N_0_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 3 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    VPSLLD $15, XTMP4, XTMP2; \
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    ADDL (disp + 3*4)(SP), y2; \ // y2 = SS1 + W
    VPSRLD $(32-15), XTMP4, XTMP3; \
    ADDL h, y2; \ // y2 = h + SS1 + W
    ADDL (disp + 3*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    VPOR XTMP3, XTMP2, XTMP3; \ // XTMP3 = XTMP4 rol 15 {DCxx}
    MOVL a, h; \
    XORL b, h; \
    XORL c, h; \
    VPSHUFB R08_SHUFFLE_MASK, XTMP3, XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCxx}
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    MOVL e, y1; \
    XORL f, y1; \
    XORL g, y1; \
    VPXOR XTMP3, XTMP4, XTMP3; \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCxx})
    ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
    ROLL $9, b; \
    ROLL $19, f; \
    MOVL y2, y0; \
    ROLL $9, y0; \
    VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCxx}) ^ (XTMP4 rol 23 {DCxx})
    MOVL y2, d; \
    ROLL $17, d; \
    XORL y0, d; \
    XORL y2, d; \ // d = P(tt2)
    VPXOR XTMP1, XTMP0, XWORD0; \ // XWORD0 = {W[3], W[2], W[1], W[0]}

// For rounds [16 - 64)
#define ROUND_AND_SCHED_N_1_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 0 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    VPALIGNR $12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    VPSLLD $7, XTMP0, XTMP1; \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
    ADDL (disp + 0*4)(SP), y2; \ // y2 = SS1 + W
    ADDL h, y2; \ // y2 = h + SS1 + W
    ADDL (disp + 0*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    VPSRLD $(32-7), XTMP0, XTMP0; \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
    ADDL d, y0; \ // y0 = d + SS2 + W'
    MOVL a, y1; \
    ORL b, y1; \
    VPOR XTMP0, XTMP1, XTMP1; \ // XTMP1 = W[-13] rol 7 = {ROTL(7,w6),ROTL(7,w5),ROTL(7,w4),ROTL(7,w3)}
    MOVL a, h; \
    ANDL b, h; \
    ANDL c, y1; \
    ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
    VPALIGNR $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    MOVL f, y1; \
    XORL g, y1; \
    ANDL e, y1; \
    VPXOR XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
    XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
    ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
    ROLL $9, b; \
    ROLL $19, f; \
    VPALIGNR $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
    P0(y2, y0, d); \
    VPXOR XWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9] ^ W[-16]

#define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 1 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    VPSHUFD $0xA5, XWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    ADDL (disp + 1*4)(SP), y2; \ // y2 = SS1 + W
    ADDL h, y2; \ // y2 = h + SS1 + W
    VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {xBxA}
    ADDL (disp + 1*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    MOVL a, y1; \
    ORL b, y1; \
    VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
    MOVL a, h; \
    ANDL b, h; \
    ANDL c, y1; \
    ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
    VPSHUFD $0x00, XTMP2, XTMP2; \ // XTMP2 = {AAAA}
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    MOVL f, y1; \
    XORL g, y1; \
    ANDL e, y1; \
    VPSRLQ $17, XTMP2, XTMP3; \ // XTMP3 = XTMP2 rol 15 {xxxA}
    XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
    ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
    ROLL $9, b; \
    ROLL $19, f; \
    VPSRLQ $9, XTMP2, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
    P0(y2, y0, d); \
    VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 23 {xxxA})

#define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 2 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    VPXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    ADDL (disp + 2*4)(SP), y2; \ // y2 = SS1 + W
    ADDL h, y2; \ // y2 = h + SS1 + W
    VPXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {..., ..., ..., W[0]}
    ADDL (disp + 2*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    MOVL a, y1; \
    ORL b, y1; \
    VPALIGNR $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = {W[0], w15, w14, w13}
    MOVL a, h; \
    ANDL b, h; \
    ANDL c, y1; \
    ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
    VPSLLD $15, XTMP3, XTMP4; \
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    MOVL f, y1; \
    XORL g, y1; \
    ANDL e, y1; \
    VPSRLD $(32-15), XTMP3, XTMP3; \
    XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
    ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
    ROLL $9, b; \
    ROLL $19, f; \
    VPOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = (W[-3] rol 15) {DCBA}
    P0(y2, y0, d); \
    VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCBA}

#define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3) \
    ; \ // ############################# RND N + 3 ############################//
    MOVL a, y0; \
    ROLL $12, y0; \ // y0 = a <<< 12
    MOVL e, y2; \
    ADDL $const, y2; \
    ADDL y0, y2; \ // y2 = a <<< 12 + e + T
    VPSLLD $15, XTMP4, XTMP2; \
    ROLL $7, y2; \ // y2 = SS1
    XORL y2, y0 \ // y0 = SS2
    ADDL (disp + 3*4)(SP), y2; \ // y2 = SS1 + W
    ADDL h, y2; \ // y2 = h + SS1 + W
    VPSRLD $(32-15), XTMP4, XTMP3; \
    ADDL (disp + 3*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    MOVL a, y1; \
    ORL b, y1; \
    VPOR XTMP3, XTMP2, XTMP3; \ // XTMP3 = XTMP4 rol 15 {DCBA}
    MOVL a, h; \
    ANDL b, h; \
    ANDL c, y1; \
    ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
    VPSHUFB R08_SHUFFLE_MASK, XTMP3, XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCBA}
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    MOVL f, y1; \
    XORL g, y1; \
    ANDL e, y1; \
    VPXOR XTMP3, XTMP4, XTMP3; \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCBA})
    XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
    ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
    ROLL $9, b; \
    ROLL $19, f; \
    VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCBA}) ^ (XTMP4 rol 23 {DCBA})
    P0(y2, y0, d); \
    VPXOR XTMP1, XTMP0, XWORD0; \ // XWORD0 = {W[3], W[2], W[1], W[0]}

// For rounds [0 - 16)
#define DO_ROUND_N_0(disp, idx, const, a, b, c, d, e, f, g, h) \
    ; \ // ############################# RND N + 0 ############################//
    SS12(a, e, const, y2, y0); \
    ADDL (disp + idx*4)(SP), y2; \ // y2 = SS1 + W
    ADDL h, y2; \ // y2 = h + SS1 + W
    ADDL (disp + idx*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    ; \
    MOVL a, h; \
    XORL b, h; \
    XORL c, h; \
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    ; \
    MOVL e, y1; \
    XORL f, y1; \
    XORL g, y1; \
    ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
    ; \
    ROLL $9, b; \
    ROLL $19, f; \
    ; \
    P0(y2, y0, d)

// For rounds [16 - 64)
#define DO_ROUND_N_1(disp, idx, const, a, b, c, d, e, f, g, h) \
    ; \ // ############################# RND N + 0 ############################//
    SS12(a, e, const, y2, y0); \
    ADDL (disp + idx*4)(SP), y2; \ // y2 = SS1 + W
    ADDL h, y2; \ // y2 = h + SS1 + W
    ADDL (disp + idx*4 + 16)(SP), y0; \ // y0 = SS2 + W'
    ADDL d, y0; \ // y0 = d + SS2 + W'
    ; \
    MOVL a, y1; \
    ORL b, y1; \
    MOVL a, h; \
    ANDL b, h; \
    ANDL c, y1; \
    ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
    ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    ; \
    MOVL f, y1; \
    XORL g, y1; \
    ANDL e, y1; \
    XORL g, y1; \ // y1 = GG2(e, f, g)
    ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
    ; \
    ROLL $9, b; \
    ROLL $19, f; \
    ; \
    P0(y2, y0, d)
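
// For reference, one scalar round as implemented by DO_ROUND_N_0 / DO_ROUND_N_1
// (and, fused with scheduling, by the ROUND_AND_SCHED_* macros above) looks
// roughly like the Go sketch below; w is the word at (disp + idx*4)(SP), wp is
// the value at (disp + idx*4 + 16)(SP), i.e. w ^ w[+4], and all names are
// illustrative only. Instead of shifting all eight state words, the callers
// rotate the (a..h) register assignment on every invocation, so only b, d, f
// and h are written here.
//
//	ss2 := bits.RotateLeft32(a, 12)
//	ss1 := bits.RotateLeft32(ss2+e+constT, 7)
//	ss2 ^= ss1
//	var ff, gg uint32
//	if j < 16 {
//		ff, gg = a^b^c, e^f^g                    // FF0, GG0
//	} else {
//		ff, gg = (a&b)|(a&c)|(b&c), (e&f)|(^e&g) // FF1, GG1
//	}
//	tt1 := ff + d + ss2 + wp
//	tt2 := gg + h + ss1 + w
//	b = bits.RotateLeft32(b, 9)
//	f = bits.RotateLeft32(f, 19)
//	h = tt1
//	d = tt2 ^ bits.RotateLeft32(tt2, 9) ^ bits.RotateLeft32(tt2, 17) // P0(tt2)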

// Requires: SSE2, SSSE3
#define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
    MOVOU XWORD1, XTMP0; \
    PALIGNR $12, XWORD0, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
    MOVOU XTMP0, XTMP1; \
    PSLLL $7, XTMP1; \
    PSRLL $(32-7), XTMP0; \
    POR XTMP0, XTMP1; \ // XTMP1 = W[-13] rol 7
    MOVOU XWORD3, XTMP0; \
    PALIGNR $8, XWORD2, XTMP0; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
    PXOR XTMP1, XTMP0; \ // XTMP0 = W[-6] XOR (W[-13] rol 7)
    ; \ // Prepare P1 parameters
    MOVOU XWORD2, XTMP1; \
    PALIGNR $12, XWORD1, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
    PXOR XWORD0, XTMP1; \ // XTMP1 = W[-9] XOR W[-16]
    PSHUFD $0xA5, XWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
    PSRLQ $17, XTMP2; \ // XTMP2 = W[-3] rol 15 {xBxA}
    PXOR XTMP1, XTMP2; \ // XTMP2 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {xxxA}
    ; \ // P1
    PSHUFD $0x00, XTMP2, XTMP2; \ // XTMP2 = {AAAA}
    MOVOU XTMP2, XTMP3; \
    PSRLQ $17, XTMP3; \ // XTMP3 = XTMP2 rol 15 {xxxA}
    MOVOU XTMP2, XTMP4; \
    PSRLQ $9, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
    PXOR XTMP2, XTMP4; \
    PXOR XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxxA}) XOR (XTMP2 rol 23 {xxxA})
    ; \ // First word of the message schedule result
    MOVOU XTMP0, XTMP2; \
    PXOR XTMP4, XTMP2; \ // XTMP2 = {..., ..., ..., W[0]}
    ; \ // Prepare P1 parameters
    PALIGNR $4, XWORD3, XTMP2; \ // XTMP2 = {W[0], w15, w14, w13}
    MOVOU XTMP2, XTMP4; \
    PSLLL $15, XTMP4; \
    PSRLL $(32-15), XTMP2; \
    POR XTMP2, XTMP4; \ // XTMP4 = W[-3] rol 15 {DCBA}
    PXOR XTMP1, XTMP4; \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCBA}
    ; \ // P1
    MOVOU XTMP4, XTMP2; \
    PSLLL $15, XTMP2; \
    MOVOU XTMP4, XTMP3; \
    PSRLL $(32-15), XTMP3; \
    POR XTMP2, XTMP3; \ // XTMP3 = XTMP4 rol 15 {DCBA}
    MOVOU XTMP3, XTMP1; \
    PSHUFB r08_mask<>(SB), XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCBA}
    PXOR XTMP4, XTMP3; \
    PXOR XTMP3, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCBA}) XOR (XTMP4 rol 23 {DCBA})
    ; \ // All 4 words of the message schedule result
    MOVOU XTMP0, XWORD0; \
    PXOR XTMP1, XWORD0
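
// MESSAGE_SCHEDULE above (and the VP* instructions interleaved into the
// ROUND_AND_SCHED_* macros) expand four message words per call. Per word this
// is the standard SM3 schedule; a scalar Go sketch (illustrative only):
//
//	p1 := func(x uint32) uint32 { return x ^ bits.RotateLeft32(x, 15) ^ bits.RotateLeft32(x, 23) }
//	for j := 16; j < 68; j++ {
//		w[j] = p1(w[j-16]^w[j-9]^bits.RotateLeft32(w[j-3], 15)) ^
//			bits.RotateLeft32(w[j-13], 7) ^ w[j-6]
//	}
//
// The fourth word of a group depends on w[j-3], which is produced inside the
// same group, so the vector code first derives W[0] alone (the {xxxA} lanes)
// and then reruns P1 over all four lanes once W[0] is available.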

TEXT ·blockSIMD(SB), 0, $48-32
    MOVQ dig+0(FP), CTX // d.h[8]
    MOVQ p_base+8(FP), INP
    MOVQ p_len+16(FP), NUM_BYTES

    LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
    MOVQ NUM_BYTES, _INP_END(SP)

    // Load initial digest
    MOVL 0(CTX), a  // a = H0
    MOVL 4(CTX), b  // b = H1
    MOVL 8(CTX), c  // c = H2
    MOVL 12(CTX), d // d = H3
    MOVL 16(CTX), e // e = H4
    MOVL 20(CTX), f // f = H5
    MOVL 24(CTX), g // g = H6
    MOVL 28(CTX), h // h = H7

    CMPB ·useAVX(SB), $1
    JE avx

    MOVOU flip_mask<>(SB), X_BYTE_FLIP_MASK
    MOVOU r08_mask<>(SB), R08_SHUFFLE_MASK

sse_loop: // each iteration processes one 512-bit block
    MOVOU 0(INP), XWORD0
    MOVOU 16(INP), XWORD1
    MOVOU 32(INP), XWORD2
    MOVOU 48(INP), XWORD3

    PSHUFB X_BYTE_FLIP_MASK, XWORD0 // w3, w2, w1, w0
    PSHUFB X_BYTE_FLIP_MASK, XWORD1 // w7, w6, w5, w4
    PSHUFB X_BYTE_FLIP_MASK, XWORD2 // w11, w10, w9, w8
    PSHUFB X_BYTE_FLIP_MASK, XWORD3 // w15, w14, w13, w12

    ADDQ $64, INP

sse_schedule_compress: // for w0 - w47
    // Do 4 rounds and scheduling
    MOVOU XWORD0, (_XFER + 0*16)(SP)
    MOVOU XWORD1, XFER
    PXOR XWORD0, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_0(_XFER, 0, T0, a, b, c, d, e, f, g, h)
    DO_ROUND_N_0(_XFER, 1, T1, h, a, b, c, d, e, f, g)
    MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
    DO_ROUND_N_0(_XFER, 2, T2, g, h, a, b, c, d, e, f)
    DO_ROUND_N_0(_XFER, 3, T3, f, g, h, a, b, c, d, e)

    // Do 4 rounds and scheduling
    MOVOU XWORD1, (_XFER + 0*16)(SP)
    MOVOU XWORD2, XFER
    PXOR XWORD1, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_0(_XFER, 0, T4, e, f, g, h, a, b, c, d)
    DO_ROUND_N_0(_XFER, 1, T5, d, e, f, g, h, a, b, c)
    MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
    DO_ROUND_N_0(_XFER, 2, T6, c, d, e, f, g, h, a, b)
    DO_ROUND_N_0(_XFER, 3, T7, b, c, d, e, f, g, h, a)

    // Do 4 rounds and scheduling
    MOVOU XWORD2, (_XFER + 0*16)(SP)
    MOVOU XWORD3, XFER
    PXOR XWORD2, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_0(_XFER, 0, T8, a, b, c, d, e, f, g, h)
    DO_ROUND_N_0(_XFER, 1, T9, h, a, b, c, d, e, f, g)
    MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
    DO_ROUND_N_0(_XFER, 2, T10, g, h, a, b, c, d, e, f)
    DO_ROUND_N_0(_XFER, 3, T11, f, g, h, a, b, c, d, e)

    // Do 4 rounds and scheduling
    MOVOU XWORD3, (_XFER + 0*16)(SP)
    MOVOU XWORD0, XFER
    PXOR XWORD3, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_0(_XFER, 0, T12, e, f, g, h, a, b, c, d)
    DO_ROUND_N_0(_XFER, 1, T13, d, e, f, g, h, a, b, c)
    MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
    DO_ROUND_N_0(_XFER, 2, T14, c, d, e, f, g, h, a, b)
    DO_ROUND_N_0(_XFER, 3, T15, b, c, d, e, f, g, h, a)

    // Do 4 rounds and scheduling
    MOVOU XWORD0, (_XFER + 0*16)(SP)
    MOVOU XWORD1, XFER
    PXOR XWORD0, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T16, a, b, c, d, e, f, g, h)
    DO_ROUND_N_1(_XFER, 1, T17, h, a, b, c, d, e, f, g)
    MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
    DO_ROUND_N_1(_XFER, 2, T18, g, h, a, b, c, d, e, f)
    DO_ROUND_N_1(_XFER, 3, T19, f, g, h, a, b, c, d, e)

    // Do 4 rounds and scheduling
    MOVOU XWORD1, (_XFER + 0*16)(SP)
    MOVOU XWORD2, XFER
    PXOR XWORD1, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T20, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T21, d, e, f, g, h, a, b, c)
    MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
    DO_ROUND_N_1(_XFER, 2, T22, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T23, b, c, d, e, f, g, h, a)

    // Do 4 rounds and scheduling
    MOVOU XWORD2, (_XFER + 0*16)(SP)
    MOVOU XWORD3, XFER
    PXOR XWORD2, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T24, a, b, c, d, e, f, g, h)
    DO_ROUND_N_1(_XFER, 1, T25, h, a, b, c, d, e, f, g)
    MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
    DO_ROUND_N_1(_XFER, 2, T26, g, h, a, b, c, d, e, f)
    DO_ROUND_N_1(_XFER, 3, T27, f, g, h, a, b, c, d, e)

    // Do 4 rounds and scheduling
    MOVOU XWORD3, (_XFER + 0*16)(SP)
    MOVOU XWORD0, XFER
    PXOR XWORD3, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T28, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T29, d, e, f, g, h, a, b, c)
    MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
    DO_ROUND_N_1(_XFER, 2, T30, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T31, b, c, d, e, f, g, h, a)

    // Do 4 rounds and scheduling
    MOVOU XWORD0, (_XFER + 0*16)(SP)
    MOVOU XWORD1, XFER
    PXOR XWORD0, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T32, a, b, c, d, e, f, g, h)
    DO_ROUND_N_1(_XFER, 1, T33, h, a, b, c, d, e, f, g)
    MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
    DO_ROUND_N_1(_XFER, 2, T34, g, h, a, b, c, d, e, f)
    DO_ROUND_N_1(_XFER, 3, T35, f, g, h, a, b, c, d, e)

    // Do 4 rounds and scheduling
    MOVOU XWORD1, (_XFER + 0*16)(SP)
    MOVOU XWORD2, XFER
    PXOR XWORD1, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T36, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T37, d, e, f, g, h, a, b, c)
    MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
    DO_ROUND_N_1(_XFER, 2, T38, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T39, b, c, d, e, f, g, h, a)

    // Do 4 rounds and scheduling
    MOVOU XWORD2, (_XFER + 0*16)(SP)
    MOVOU XWORD3, XFER
    PXOR XWORD2, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T40, a, b, c, d, e, f, g, h)
    DO_ROUND_N_1(_XFER, 1, T41, h, a, b, c, d, e, f, g)
    MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
    DO_ROUND_N_1(_XFER, 2, T42, g, h, a, b, c, d, e, f)
    DO_ROUND_N_1(_XFER, 3, T43, f, g, h, a, b, c, d, e)

    // Do 4 rounds and scheduling
    MOVOU XWORD3, (_XFER + 0*16)(SP)
    MOVOU XWORD0, XFER
    PXOR XWORD3, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T44, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T45, d, e, f, g, h, a, b, c)
    MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
    DO_ROUND_N_1(_XFER, 2, T46, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T47, b, c, d, e, f, g, h, a)

    // w48 - w63 processed with only 4 rounds scheduling (last 16 rounds)
    // Do 4 rounds
    MOVOU XWORD0, (_XFER + 0*16)(SP)
    MOVOU XWORD1, XFER
    PXOR XWORD0, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T48, a, b, c, d, e, f, g, h)
    DO_ROUND_N_1(_XFER, 1, T49, h, a, b, c, d, e, f, g)
    DO_ROUND_N_1(_XFER, 2, T50, g, h, a, b, c, d, e, f)
    DO_ROUND_N_1(_XFER, 3, T51, f, g, h, a, b, c, d, e)

    // Do 4 rounds
    MOVOU XWORD1, (_XFER + 0*16)(SP)
    MOVOU XWORD2, XFER
    PXOR XWORD1, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T52, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T53, d, e, f, g, h, a, b, c)
    DO_ROUND_N_1(_XFER, 2, T54, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T55, b, c, d, e, f, g, h, a)

    // Do 4 rounds
    MOVOU XWORD2, (_XFER + 0*16)(SP)
    MOVOU XWORD3, XFER
    PXOR XWORD2, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
    DO_ROUND_N_1(_XFER, 0, T56, a, b, c, d, e, f, g, h)
    DO_ROUND_N_1(_XFER, 1, T57, h, a, b, c, d, e, f, g)
    DO_ROUND_N_1(_XFER, 2, T58, g, h, a, b, c, d, e, f)
    DO_ROUND_N_1(_XFER, 3, T59, f, g, h, a, b, c, d, e)

    // Do 4 rounds
    MOVOU XWORD3, (_XFER + 0*16)(SP)
    MOVOU XWORD0, XFER
    PXOR XWORD3, XFER
    MOVOU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T60, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T61, d, e, f, g, h, a, b, c)
    DO_ROUND_N_1(_XFER, 2, T62, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T63, b, c, d, e, f, g, h, a)

    xorm( 0(CTX), a)
    xorm( 4(CTX), b)
    xorm( 8(CTX), c)
    xorm( 12(CTX), d)
    xorm( 16(CTX), e)
    xorm( 20(CTX), f)
    xorm( 24(CTX), g)
    xorm( 28(CTX), h)

    CMPQ _INP_END(SP), INP
    JAE sse_loop

sse_done_hash:
    RET

avx:
    VMOVDQU flip_mask<>(SB), X_BYTE_FLIP_MASK
    VMOVDQU r08_mask<>(SB), R08_SHUFFLE_MASK

avx_loop: // each iteration processes one 512-bit block

    VMOVDQU 0(INP), XWORD0
    VMOVDQU 16(INP), XWORD1
    VMOVDQU 32(INP), XWORD2
    VMOVDQU 48(INP), XWORD3

    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0 // w3, w2, w1, w0
    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1 // w7, w6, w5, w4
    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2 // w11, w10, w9, w8
    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3 // w15, w14, w13, w12

    ADDQ $64, INP

avx_schedule_compress: // for w0 - w47
    // Do 4 rounds and scheduling
    VMOVDQU XWORD0, (_XFER + 0*16)(SP)
    VPXOR XWORD0, XWORD1, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_0_0(_XFER, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_0_1(_XFER, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_0_2(_XFER, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_0_3(_XFER, T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD1, (_XFER + 0*16)(SP)
    VPXOR XWORD1, XWORD2, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_0_0(_XFER, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_0_1(_XFER, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_0_2(_XFER, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_0_3(_XFER, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD2, (_XFER + 0*16)(SP)
    VPXOR XWORD2, XWORD3, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_0_0(_XFER, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_0_1(_XFER, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_0_2(_XFER, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_0_3(_XFER, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD3, (_XFER + 0*16)(SP)
    VPXOR XWORD3, XWORD0, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_0_0(_XFER, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_0_1(_XFER, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_0_2(_XFER, T14, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_0_3(_XFER, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD0, (_XFER + 0*16)(SP)
    VPXOR XWORD0, XWORD1, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_1(_XFER, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_2(_XFER, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_3(_XFER, T19, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD1, (_XFER + 0*16)(SP)
    VPXOR XWORD1, XWORD2, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_1_1(_XFER, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_1_2(_XFER, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_1_3(_XFER, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD2, (_XFER + 0*16)(SP)
    VPXOR XWORD2, XWORD3, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)

    ROUND_AND_SCHED_N_1_0(_XFER, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_1_1(_XFER, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_1_2(_XFER, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_1_3(_XFER, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD3, (_XFER + 0*16)(SP)
    VPXOR XWORD3, XWORD0, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_1_1(_XFER, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_1_2(_XFER, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_1_3(_XFER, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD0, (_XFER + 0*16)(SP)
    VPXOR XWORD0, XWORD1, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_1(_XFER, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_2(_XFER, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_3(_XFER, T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD1, (_XFER + 0*16)(SP)
    VPXOR XWORD1, XWORD2, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_1_1(_XFER, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_1_2(_XFER, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0)
    ROUND_AND_SCHED_N_1_3(_XFER, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD2, (_XFER + 0*16)(SP)
    VPXOR XWORD2, XWORD3, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_1_1(_XFER, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_1_2(_XFER, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1)
    ROUND_AND_SCHED_N_1_3(_XFER, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1)

    // Do 4 rounds and scheduling
    VMOVDQU XWORD3, (_XFER + 0*16)(SP)
    VPXOR XWORD3, XWORD0, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_1_1(_XFER, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_1_2(_XFER, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2)
    ROUND_AND_SCHED_N_1_3(_XFER, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2)

    // w48 - w63 processed with only 4 rounds scheduling (last 16 rounds)
    // Do 4 rounds and scheduling
    VMOVDQU XWORD0, (_XFER + 0*16)(SP)
    VPXOR XWORD0, XWORD1, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    ROUND_AND_SCHED_N_1_0(_XFER, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_1(_XFER, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_2(_XFER, T50, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3)
    ROUND_AND_SCHED_N_1_3(_XFER, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3)

    // w52 - w63 processed with no scheduling (last 12 rounds)
    // Do 4 rounds
    VMOVDQU XWORD1, (_XFER + 0*16)(SP)
    VPXOR XWORD1, XWORD2, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T52, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T53, d, e, f, g, h, a, b, c)
    DO_ROUND_N_1(_XFER, 2, T54, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T55, b, c, d, e, f, g, h, a)

    // Do 4 rounds
    VMOVDQU XWORD2, (_XFER + 0*16)(SP)
    VPXOR XWORD2, XWORD3, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T56, a, b, c, d, e, f, g, h)
    DO_ROUND_N_1(_XFER, 1, T57, h, a, b, c, d, e, f, g)
    DO_ROUND_N_1(_XFER, 2, T58, g, h, a, b, c, d, e, f)
    DO_ROUND_N_1(_XFER, 3, T59, f, g, h, a, b, c, d, e)

    // Do 4 rounds
    VMOVDQU XWORD3, (_XFER + 0*16)(SP)
    VPXOR XWORD3, XWORD0, XFER
    VMOVDQU XFER, (_XFER + 1*16)(SP)
    DO_ROUND_N_1(_XFER, 0, T60, e, f, g, h, a, b, c, d)
    DO_ROUND_N_1(_XFER, 1, T61, d, e, f, g, h, a, b, c)
    DO_ROUND_N_1(_XFER, 2, T62, c, d, e, f, g, h, a, b)
    DO_ROUND_N_1(_XFER, 3, T63, b, c, d, e, f, g, h, a)

    xorm( 0(CTX), a)
    xorm( 4(CTX), b)
    xorm( 8(CTX), c)
    xorm( 12(CTX), d)
    xorm( 16(CTX), e)
    xorm( 20(CTX), f)
    xorm( 24(CTX), g)
    xorm( 28(CTX), h)

    CMPQ _INP_END(SP), INP
    JAE avx_loop

done_hash:
    RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $16

DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $16
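
// Note on the two constants above (explanatory, not part of the original
// source comments): flip_mask reverses the byte order inside each 32-bit lane,
// so the big-endian message words become native little-endian register values,
// and r08_mask rotates every 32-bit lane left by 8 bits, turning a value that
// is already rotated by 15 into the "rol 23" term of P1 with a single PSHUFB.
// In Go terms, per lane: rol23 == bits.RotateLeft32(rol15, 8).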