// github.com/emmansun/gmsm@v0.29.1/sm3/sm3block_avx2_amd64.s

//go:build !purego

#include "textflag.h"

#include "sm3_const_asm.s"

// Definitions for AVX2 version

// xorm (mem), reg
// Xor reg to mem using reg-mem xor and store
#define xorm(P1, P2) \
	XORL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8

#define XFER Y9
#define R08_SHUFFLE_MASK Y10

#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d DX
#define e R8
#define f R9
#define g R10
#define h R11

#define y0 R12
#define y1 R13
#define y2 R14

// Offsets
#define XFER_SIZE 4*64*4
#define INP_END_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define STACK_SIZE _INP_END + INP_END_SIZE

#define P0(tt2, tmp, out) \
	RORXL $23, tt2, tmp; \
	RORXL $15, tt2, out; \
	XORL tmp, out; \
	XORL tt2, out

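// The ROUND_AND_SCHED_N_* macros below fuse one compression round with a quarter
// of the message expansion. As a rough reference (a sketch of the SM3
// specification, not a line-by-line mapping of the assembly), round j computes
//
//	SS1 := ROTL32(ROTL32(a, 12)+e+ROTL32(Tj, j), 7)
//	SS2 := SS1 ^ ROTL32(a, 12)
//	TT1 := FF(a, b, c) + d + SS2 + (W[j] ^ W[j+4])
//	TT2 := GG(e, f, g) + h + SS1 + W[j]
//	d, c, b, a = c, ROTL32(b, 9), a, TT1
//	h, g, f, e = g, ROTL32(f, 19), e, P0(TT2)
//
// with P0(x) = x ^ ROTL32(x, 9) ^ ROTL32(x, 17) (the P0 macro above gets the two
// rotations from RORXL $23 and $15), FF(x, y, z) = GG(x, y, z) = x ^ y ^ z for
// rounds 0-15, and expanded words
//
//	W[j] = P1(W[j-16] ^ W[j-9] ^ ROTL32(W[j-3], 15)) ^ ROTL32(W[j-13], 7) ^ W[j-6]
//	P1(x) = x ^ ROTL32(x, 15) ^ ROTL32(x, 23)
//
// The T0..T63 constants from sm3_const_asm.s are presumably the already-rotated
// ROTL32(Tj, j) values, since the code adds them without any further rotation.
// The a..h shuffle is done by rotating the register names at each call site;
// only b and f are rotated in place (ROLL $9 / ROLL $19).
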
// For rounds [0 - 16)
#define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12, RORXL is BMI2 instr
	MOVL e, y2; \
	ADDL $const, y2; \
	VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	VPSLLD $7, XTMP0, XTMP1; \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
	ADDL (disp + 0*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 0*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	VPSRLD $(32-7), XTMP0, XTMP0; \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
	ADDL d, y0; \ // y0 = d + SS2 + W'
	MOVL a, h; \
	XORL b, h; \
	VPOR XTMP0, XTMP1, XTMP1; \ // XTMP1 = W[-13] rol 7
	XORL c, h; \
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL e, y1; \
	VPALIGNR $8, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	VPXOR XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
	ROLL $9, b; \
	ROLL $19, f; \
	VPALIGNR $12, XDWORD1, XDWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
	P0(y2, y0, d); \
	VPXOR XDWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9] ^ W[-16]

#define ROUND_AND_SCHED_N_0_1(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 1 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12
	MOVL e, y2; \
	ADDL $const, y2; \
	VPSHUFD $0xA5, XDWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {xBxA}
	ADDL (disp + 1*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 1*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
	ADDL d, y0; \ // y0 = d + SS2 + W'
	MOVL a, h; \
	XORL b, h; \
	VPSHUFD $0x00, XTMP2, XTMP2; \ // XTMP2 = {AAAA}
	XORL c, h; \
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL e, y1; \
	XORL f, y1; \
	VPSRLQ $17, XTMP2, XTMP3; \ // XTMP3 = XTMP2 rol 15 {xxxA}
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	ROLL $9, b; \
	ROLL $19, f; \
	VPSRLQ $9, XTMP2, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
	P0(y2, y0, d); \
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 ^ (XTMP2 rol 23 {xxxA})

#define ROUND_AND_SCHED_N_0_2(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 2 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12
	MOVL e, y2; \
	ADDL $const, y2; \
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	VPXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 2*4)(SP), y2; \ // y2 = SS1 + W
	VPXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {..., ..., ..., W[0]}
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 2*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	VPALIGNR $4, XDWORD3, XTMP2, XTMP3; \ // XTMP3 = {W[0], w15, w14, w13}
	MOVL a, h; \
	XORL b, h; \
	XORL c, h; \
	VPSLLD $15, XTMP3, XTMP4; \
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	VPSRLD $(32-15), XTMP3, XTMP3; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	ROLL $9, b; \
	ROLL $19, f; \
	VPOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = (W[-3] rol 15) {DCBA}
	P0(y2, y0, d); \
	VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCBA}

#define ROUND_AND_SCHED_N_0_3(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 3 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12
	MOVL e, y2; \
	ADDL $const, y2; \
	VPSLLD $15, XTMP4, XTMP2; \
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	VPSRLD $(32-15), XTMP4, XTMP3; \
	ADDL (disp + 3*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 3*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	VPOR XTMP3, XTMP2, XTMP3; \ // XTMP3 = XTMP4 rol 15 {DCBA}
	MOVL a, h; \
	XORL b, h; \
	XORL c, h; \
	VPSHUFB R08_SHUFFLE_MASK, XTMP3, XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCBA}
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	VPXOR XTMP3, XTMP4, XTMP3; \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCBA})
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	ROLL $9, b; \
	ROLL $19, f; \
	VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCBA}) ^ (XTMP4 rol 23 {DCBA})
	P0(y2, y0, d); \
	VPXOR XTMP1, XTMP0, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}

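// From round 16 on the boolean functions change (the message schedule does not):
//
//	FF(x, y, z) = (x & y) | (x & z) | (y & z)   // majority
//	GG(x, y, z) = (x & y) | (^x & z)            // choice
//
// The macros below evaluate GG as ((f ^ g) & e) ^ g, an equivalent form that
// avoids materializing NOT e.
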
// For rounds [16 - 64)
#define ROUND_AND_SCHED_N_1_0(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 0 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12
	MOVL e, y2; \
	ADDL $const, y2; \
	VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	VPSLLD $7, XTMP0, XTMP1; \ // XTMP1 = W[-13] << 7 = {w6<<7,w5<<7,w4<<7,w3<<7}
	ADDL (disp + 0*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + 0*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	VPSRLD $(32-7), XTMP0, XTMP0; \ // XTMP0 = W[-13] >> 25 = {w6>>25,w5>>25,w4>>25,w3>>25}
	ADDL d, y0; \ // y0 = d + SS2 + W'
	MOVL a, y1; \
	ORL b, y1; \
	VPOR XTMP0, XTMP1, XTMP1; \ // XTMP1 = W[-13] rol 7 = {ROTL(7,w6),ROTL(7,w5),ROTL(7,w4),ROTL(7,w3)}
	MOVL a, h; \
	ANDL b, h; \
	ANDL c, y1; \
	ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VPALIGNR $8, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL f, y1; \
	XORL g, y1; \
	ANDL e, y1; \
	VPXOR XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
	XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	ROLL $9, b; \
	ROLL $19, f; \
	VPALIGNR $12, XDWORD1, XDWORD2, XTMP1; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}
	P0(y2, y0, d); \
	VPXOR XDWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9] ^ W[-16]

#define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 1 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12
	MOVL e, y2; \
	ADDL $const, y2; \
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	VPSHUFD $0xA5, XDWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA} {w14,w14,w13,w13}
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 1*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-3] rol 15 {xBxA}
	ADDL (disp + 1*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	MOVL a, y1; \
	ORL b, y1; \
	VPXOR XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxxA}
	MOVL a, h; \
	ANDL b, h; \
	ANDL c, y1; \
	ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VPSHUFD $0x00, XTMP2, XTMP2; \ // XTMP2 = {AAAA}
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL f, y1; \
	XORL g, y1; \
	ANDL e, y1; \
	VPSRLQ $17, XTMP2, XTMP3; \ // XTMP3 = XTMP2 rol 15 {xxxA}
	XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	ROLL $9, b; \
	ROLL $19, f; \
	VPSRLQ $9, XTMP2, XTMP4; \ // XTMP4 = XTMP2 rol 23 {xxxA}
	P0(y2, y0, d); \
	VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 23 {xxxA})

#define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 2 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12
	MOVL e, y2; \
	ADDL $const, y2; \
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	VPXOR XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 ^ (XTMP2 rol 15 {xxxA}) ^ (XTMP2 rol 23 {xxxA})
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 2*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	VPXOR XTMP4, XTMP0, XTMP2; \ // XTMP2 = {..., ..., W[1], W[0]}
	ADDL (disp + 2*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	MOVL a, y1; \
	ORL b, y1; \
	VPALIGNR $4, XDWORD3, XTMP2, XTMP3; \ // XTMP3 = {W[0], w15, w14, w13}
	MOVL a, h; \
	ANDL b, h; \
	ANDL c, y1; \
	ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VPSLLD $15, XTMP3, XTMP4; \
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL f, y1; \
	XORL g, y1; \
	ANDL e, y1; \
	VPSRLD $(32-15), XTMP3, XTMP3; \
	XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	ROLL $9, b; \
	ROLL $19, f; \
	VPOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = (W[-3] rol 15) {DCBA}
	P0(y2, y0, d); \
	VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {DCBA}

#define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	; \ // ############################# RND N + 3 ############################//
	RORXL $20, a, y0; \ // y0 = a <<< 12
	MOVL e, y2; \
	ADDL $const, y2; \
	ADDL y0, y2; \ // y2 = a <<< 12 + e + T
	VPSLLD $15, XTMP4, XTMP2; \
	ROLL $7, y2; \ // y2 = SS1
	XORL y2, y0 \ // y0 = SS2
	ADDL (disp + 3*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	VPSRLD $(32-15), XTMP4, XTMP3; \
	ADDL (disp + 3*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	MOVL a, y1; \
	ORL b, y1; \
	VPOR XTMP3, XTMP2, XTMP3; \ // XTMP3 = XTMP4 rol 15 {DCBA}
	MOVL a, h; \
	ANDL b, h; \
	ANDL c, y1; \
	ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VPSHUFB R08_SHUFFLE_MASK, XTMP3, XTMP1; \ // XTMP1 = XTMP4 rol 23 {DCBA}
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	MOVL f, y1; \
	XORL g, y1; \
	ANDL e, y1; \
	VPXOR XTMP3, XTMP4, XTMP3; \ // XTMP3 = XTMP4 ^ (XTMP4 rol 15 {DCBA})
	XORL g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	ROLL $9, b; \
	ROLL $19, f; \
	VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 ^ (XTMP4 rol 15 {DCBA}) ^ (XTMP4 rol 23 {DCBA})
	P0(y2, y0, d); \
	VPXOR XTMP1, XTMP0, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}

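// SS12 computes the per-round pair ss1/ss2 (see the round sketch above); it is
// shared by the compression-only macros DO_ROUND_N_0 and DO_ROUND_N_1 below,
// which replay W and W' values already sitting in the _XFER area instead of
// recomputing the schedule.
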
#define SS12(a, e, const, ss1, ss2) \
	RORXL $20, a, ss2; \
	MOVL e, ss1; \
	ADDL $const, ss1; \
	ADDL ss2, ss1; \
	ROLL $7, ss1; \ // ss1 = (a <<< 12 + e + T) <<< 7
	XORL ss1, ss2

// For rounds [0 - 16)
#define DO_ROUND_N_0(disp, idx, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + idx ############################//
	SS12(a, e, const, y2, y0); \
	ADDL (disp + idx*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + idx*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, h; \
	XORL b, h; \
	XORL c, h; \
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL e, y1; \
	XORL f, y1; \
	XORL g, y1; \
	ADDL y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	; \
	P0(y2, y0, d)

// For rounds [16 - 64)
#define DO_ROUND_N_1(disp, idx, const, a, b, c, d, e, f, g, h) \
	; \ // ############################# RND N + idx ############################//
	SS12(a, e, const, y2, y0); \
	ADDL (disp + idx*4)(SP), y2; \ // y2 = SS1 + W
	ADDL h, y2; \ // y2 = h + SS1 + W
	ADDL (disp + idx*4 + 32)(SP), y0; \ // y0 = SS2 + W'
	ADDL d, y0; \ // y0 = d + SS2 + W'
	; \
	MOVL a, y1; \
	ORL b, y1; \
	MOVL a, h; \
	ANDL b, h; \
	ANDL c, y1; \
	ORL y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADDL y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	MOVL f, y1; \
	XORL g, y1; \
	ANDL e, y1; \
	XORL g, y1; \ // y1 = GG2(e, f, g)
	ADDL y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	; \
	ROLL $9, b; \
	ROLL $19, f; \
	; \
	P0(y2, y0, d)

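// blockAVX2 hashes full 64-byte blocks from p into the digest state at dig.
// Judging by the FP offsets and the 32-byte argument size, the Go-side
// declaration is presumably func blockAVX2(dig *digest, p []byte).
//
// Stack frame: 1024 bytes at _XFER hold the saved message schedule for two
// blocks (32 slots of 32 bytes: a W group in each even slot, the matching
// W' = W ^ W[+4] group in the odd slot that follows), and the 8 bytes at
// _INP_END hold a pointer to the start of the last block, which the code
// below compares against INP to decide how many blocks remain. The declared
// frame size (1040) leaves a few bytes of padding.
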
TEXT ·blockAVX2(SB), 0, $1040-32
	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
	VMOVDQU r08_mask<>(SB), R08_SHUFFLE_MASK

	CMPQ NUM_BYTES, INP
	JE avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop: // each iteration loads two blocks (2 x 512 bits); the first is compressed while scheduling, the second from the saved schedule

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w19, w18, w17, w16; w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w23, w22, w21, w20; w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w27, w26, w25, w24; w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w31, w30, w29, w28; w15, w14, w13, w12

avx2_last_block_enter:
	ADDQ $64, INP

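// Two blocks travel together from here: after the transpose above, the low
// 128-bit lane of XDWORD0..XDWORD3 holds the words of the block being
// compressed now and the high lane holds the words of the following block.
// Every four rounds the current W group and its W' companion are spilled (for
// both lanes) to the _XFER area, so the second block can later be compressed
// in avx2_compress purely from memory, reading the high lane at offset +16
// within each 32-byte slot.
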
avx2_schedule_compress: // for w0 - w47
	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 0*32)(SP)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)
	ROUND_AND_SCHED_N_0_0(_XFER + 0*32, T0, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_0_1(_XFER + 0*32, T1, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_0_2(_XFER + 0*32, T2, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_0_3(_XFER + 0*32, T3, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD1, (_XFER + 2*32)(SP)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)
	ROUND_AND_SCHED_N_0_0(_XFER + 2*32, T4, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_0_1(_XFER + 2*32, T5, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_0_2(_XFER + 2*32, T6, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_0_3(_XFER + 2*32, T7, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD2, (_XFER + 4*32)(SP)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 5*32)(SP)
	ROUND_AND_SCHED_N_0_0(_XFER + 4*32, T8, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_0_1(_XFER + 4*32, T9, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_0_2(_XFER + 4*32, T10, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_0_3(_XFER + 4*32, T11, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD3, (_XFER + 6*32)(SP)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 7*32)(SP)
	ROUND_AND_SCHED_N_0_0(_XFER + 6*32, T12, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_0_1(_XFER + 6*32, T13, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_0_2(_XFER + 6*32, T14, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_0_3(_XFER + 6*32, T15, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 8*32)(SP)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 9*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 8*32, T16, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_1(_XFER + 8*32, T17, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_2(_XFER + 8*32, T18, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_3(_XFER + 8*32, T19, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD1, (_XFER + 10*32)(SP)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 11*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 10*32, T20, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_1(_XFER + 10*32, T21, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_2(_XFER + 10*32, T22, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_3(_XFER + 10*32, T23, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD2, (_XFER + 12*32)(SP)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 13*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 12*32, T24, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_1(_XFER + 12*32, T25, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_2(_XFER + 12*32, T26, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_3(_XFER + 12*32, T27, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD3, (_XFER + 14*32)(SP)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 15*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 14*32, T28, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_1(_XFER + 14*32, T29, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_2(_XFER + 14*32, T30, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_3(_XFER + 14*32, T31, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 16*32)(SP)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 17*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 16*32, T32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_1(_XFER + 16*32, T33, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_2(_XFER + 16*32, T34, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_3(_XFER + 16*32, T35, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD1, (_XFER + 18*32)(SP)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 19*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 18*32, T36, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_1(_XFER + 18*32, T37, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_2(_XFER + 18*32, T38, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1_3(_XFER + 18*32, T39, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD2, (_XFER + 20*32)(SP)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 21*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 20*32, T40, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_1(_XFER + 20*32, T41, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_2(_XFER + 20*32, T42, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1_3(_XFER + 20*32, T43, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VMOVDQU XDWORD3, (_XFER + 22*32)(SP)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 23*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 22*32, T44, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_1(_XFER + 22*32, T45, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_2(_XFER + 22*32, T46, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1_3(_XFER + 22*32, T47, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	// Last 16 rounds (w48 - w63): only the first 4 still do scheduling,
	// producing w64 - w67 for the final W' values
	// Do 4 rounds and scheduling
	VMOVDQU XDWORD0, (_XFER + 24*32)(SP)
	VPXOR XDWORD0, XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 25*32)(SP)
	ROUND_AND_SCHED_N_1_0(_XFER + 24*32, T48, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_1(_XFER + 24*32, T49, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_2(_XFER + 24*32, T50, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1_3(_XFER + 24*32, T51, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// w52 - w63 processed with no scheduling (last 12 rounds)
	// Do 4 rounds
	VMOVDQU XDWORD1, (_XFER + 26*32)(SP)
	VPXOR XDWORD1, XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 27*32)(SP)
	DO_ROUND_N_1(_XFER + 26*32, 0, T52, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 26*32, 1, T53, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 26*32, 2, T54, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(_XFER + 26*32, 3, T55, b, c, d, e, f, g, h, a)

	// Do 4 rounds
	VMOVDQU XDWORD2, (_XFER + 28*32)(SP)
	VPXOR XDWORD2, XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 29*32)(SP)
	DO_ROUND_N_1(_XFER + 28*32, 0, T56, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(_XFER + 28*32, 1, T57, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(_XFER + 28*32, 2, T58, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(_XFER + 28*32, 3, T59, f, g, h, a, b, c, d, e)

	// Do 4 rounds
	VMOVDQU XDWORD3, (_XFER + 30*32)(SP)
	VPXOR XDWORD3, XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 31*32)(SP)
	DO_ROUND_N_1(_XFER + 30*32, 0, T60, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 30*32, 1, T61, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 30*32, 2, T62, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(_XFER + 30*32, 3, T63, b, c, d, e, f, g, h, a)

	xorm( 0(CTX), a)
	xorm( 4(CTX), b)
	xorm( 8(CTX), c)
	xorm( 12(CTX), d)
	xorm( 16(CTX), e)
	xorm( 20(CTX), f)
	xorm( 24(CTX), g)
	xorm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB done_hash

avx2_compress: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, 0, T0, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(_XFER + 0*32 + 16, 1, T1, h, a, b, c, d, e, f, g)
	DO_ROUND_N_0(_XFER + 0*32 + 16, 2, T2, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(_XFER + 0*32 + 16, 3, T3, f, g, h, a, b, c, d, e)

	DO_ROUND_N_0(_XFER + 2*32 + 16, 0, T4, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(_XFER + 2*32 + 16, 1, T5, d, e, f, g, h, a, b, c)
	DO_ROUND_N_0(_XFER + 2*32 + 16, 2, T6, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(_XFER + 2*32 + 16, 3, T7, b, c, d, e, f, g, h, a)

	DO_ROUND_N_0(_XFER + 4*32 + 16, 0, T8, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(_XFER + 4*32 + 16, 1, T9, h, a, b, c, d, e, f, g)
	DO_ROUND_N_0(_XFER + 4*32 + 16, 2, T10, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(_XFER + 4*32 + 16, 3, T11, f, g, h, a, b, c, d, e)

	DO_ROUND_N_0(_XFER + 6*32 + 16, 0, T12, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(_XFER + 6*32 + 16, 1, T13, d, e, f, g, h, a, b, c)
	DO_ROUND_N_0(_XFER + 6*32 + 16, 2, T14, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(_XFER + 6*32 + 16, 3, T15, b, c, d, e, f, g, h, a)

	DO_ROUND_N_1(_XFER + 8*32 + 16, 0, T16, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(_XFER + 8*32 + 16, 1, T17, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(_XFER + 8*32 + 16, 2, T18, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(_XFER + 8*32 + 16, 3, T19, f, g, h, a, b, c, d, e)

	DO_ROUND_N_1(_XFER + 10*32 + 16, 0, T20, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 10*32 + 16, 1, T21, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 10*32 + 16, 2, T22, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(_XFER + 10*32 + 16, 3, T23, b, c, d, e, f, g, h, a)

	DO_ROUND_N_1(_XFER + 12*32 + 16, 0, T24, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(_XFER + 12*32 + 16, 1, T25, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(_XFER + 12*32 + 16, 2, T26, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(_XFER + 12*32 + 16, 3, T27, f, g, h, a, b, c, d, e)

	DO_ROUND_N_1(_XFER + 14*32 + 16, 0, T28, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 14*32 + 16, 1, T29, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 14*32 + 16, 2, T30, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(_XFER + 14*32 + 16, 3, T31, b, c, d, e, f, g, h, a)

	DO_ROUND_N_1(_XFER + 16*32 + 16, 0, T32, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(_XFER + 16*32 + 16, 1, T33, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(_XFER + 16*32 + 16, 2, T34, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(_XFER + 16*32 + 16, 3, T35, f, g, h, a, b, c, d, e)

	DO_ROUND_N_1(_XFER + 18*32 + 16, 0, T36, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 18*32 + 16, 1, T37, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 18*32 + 16, 2, T38, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(_XFER + 18*32 + 16, 3, T39, b, c, d, e, f, g, h, a)

	DO_ROUND_N_1(_XFER + 20*32 + 16, 0, T40, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(_XFER + 20*32 + 16, 1, T41, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(_XFER + 20*32 + 16, 2, T42, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(_XFER + 20*32 + 16, 3, T43, f, g, h, a, b, c, d, e)

	DO_ROUND_N_1(_XFER + 22*32 + 16, 0, T44, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 22*32 + 16, 1, T45, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 22*32 + 16, 2, T46, c, d, e, f, g, h, a, b)

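// Single-block tail: only 64 bytes remain, so the block is loaded into the XMM
// (low-lane) halves of the schedule registers and the shared schedule/compress
// code above is reused. Whatever the upper lanes compute for the nonexistent
// second block is never read back, because the _INP_END/INP comparison after
// the first compression exits before avx2_compress runs.
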
	DO_ROUND_N_1(_XFER + 22*32 + 16, 3, T47, b, c, d, e, f, g, h, a)

	DO_ROUND_N_1(_XFER + 24*32 + 16, 0, T48, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(_XFER + 24*32 + 16, 1, T49, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(_XFER + 24*32 + 16, 2, T50, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(_XFER + 24*32 + 16, 3, T51, f, g, h, a, b, c, d, e)

	DO_ROUND_N_1(_XFER + 26*32 + 16, 0, T52, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 26*32 + 16, 1, T53, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 26*32 + 16, 2, T54, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(_XFER + 26*32 + 16, 3, T55, b, c, d, e, f, g, h, a)

	DO_ROUND_N_1(_XFER + 28*32 + 16, 0, T56, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(_XFER + 28*32 + 16, 1, T57, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(_XFER + 28*32 + 16, 2, T58, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(_XFER + 28*32 + 16, 3, T59, f, g, h, a, b, c, d, e)

	DO_ROUND_N_1(_XFER + 30*32 + 16, 0, T60, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(_XFER + 30*32 + 16, 1, T61, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(_XFER + 30*32 + 16, 2, T62, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(_XFER + 30*32 + 16, 3, T63, b, c, d, e, f, g, h, a)

	ADDQ $64, INP

	xorm( 0(CTX), a)
	xorm( 4(CTX), b)
	xorm( 8(CTX), c)
	xorm( 12(CTX), d)
	xorm( 16(CTX), e)
	xorm( 20(CTX), f)
	xorm( 24(CTX), g)
	xorm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA avx2_loop
	JB done_hash

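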
avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// r08_mask rotates each 32-bit word left by 8 bits via VPSHUFB; applied to a
// value already rotated left by 15 it yields the rotate-by-23 needed for P1.
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), 8, $32