// github.com/hellobchain/newcryptosm@v0.0.0-20221019060107-edb949a317e9/sm3/sm3block_amd64.s

#include "textflag.h"

#define xorm(P1, P2) \
    XORL P2, P1; \
    MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7
#define XDWORD4 Y8

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7
#define XWORD4 X8

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y10
#define XTMP5 Y11

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define T1 R12
#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// mask to convert LE -> BE
#define BYTE_FLIP_MASK   Y13
#define X_BYTE_FLIP_MASK X13 // low half of Y13

#define NUM_BYTES DX
#define INP       DI

#define CTX  SI
#define SRND SI
#define TBL  BP

// Offsets
#define XFER_SIZE    2*64*4
#define INP_END_SIZE 8
#define INP_SIZE     8

#define _XFER      0
#define _INP_END   _XFER + XFER_SIZE
#define _INP       _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE
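// Reference for the round macros below (standard SM3, GB/T 32905-2016). One
// round j updates the working state as
//
//	SS1 = ((A <<< 12) + E + (T_j <<< j)) <<< 7
//	SS2 = SS1 ^ (A <<< 12)
//	TT1 = FF_j(A, B, C) + D + SS2 + W'_j      with W'_j = W_j ^ W_{j+4}
//	TT2 = GG_j(E, F, G) + H + SS1 + W_j
//	D = C, C = B <<< 9, B = A, A = TT1
//	H = G, G = F <<< 19, F = E, E = P0(TT2)   with P0(x) = x ^ (x<<<9) ^ (x<<<17)
//
// where FF_j/GG_j are plain XOR for rounds 0-15 and the majority/choose forms
// for rounds 16-63, and the message schedule is
//
//	W_j = P1(W_{j-16} ^ W_{j-9} ^ (W_{j-3} <<< 15)) ^ (W_{j-13} <<< 7) ^ W_{j-6}
//	P1(x) = x ^ (x <<< 15) ^ (x <<< 23)
//
// Each ROUND_AND_SCHED_* macro interleaves one such round with a slice of the
// vectorized schedule for the next four W words; wj/wj2 name the stack slots
// holding W_j and W'_j, and TBL points at the precomputed T_j <<< j constants.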
#define ROUND_AND_SCHED_0_15_0(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL 0*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13]
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPSLLD $7, XTMP0, XTMP1; \
    ; \
    ADDL (wj2 + 0*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    VPSRLD $(32-7), XTMP0, XTMP2; \
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    VPOR XTMP1, XTMP2, XTMP3; \ // XTMP3 = W[-13] <<< 7
    ; \
    ADDL (wj + 0*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    VPALIGNR $8, XDWORD2, XDWORD3, XTMP1; \ // XTMP1 = W[-6]
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = W[-6] ^ (W[-13]<<<7), term outside P1
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    VPALIGNR $12, XDWORD1, XDWORD2, XTMP0; \ // XTMP0 = W[-9]
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    VPXOR XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-9] ^ W[-16], term inside P1
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f; \
    VPSHUFD $0xA5, XDWORD3, XTMP2 // XTMP2 = W[-3] {BBAA}, to be expanded

#define ROUND_AND_SCHED_0_15_1(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL 1*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    VPSLLQ $15, XTMP2, XTMP3; \ // XTMP3 = W[-3] <<< 15 {BxAx}
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPSHUFB shuff_00BA<>(SB), XTMP3, XTMP3; \ // XTMP3 = s1 {00BA}
    ; \
    ADDL (wj2 + 1*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    VPXOR XTMP0, XTMP3, XTMP3; \ // XTMP3 = x {xxBA}, kept for later use
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    VPSLLD $15, XTMP3, XTMP2; \ // XTMP2 = x << 15
    ; \
    ADDL (wj + 1*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    VPSRLD $(32-15), XTMP3, XTMP4; \ // XTMP4 = x >> (32-15)
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    VPOR XTMP2, XTMP4, XTMP5; \ // XTMP5 = x <<< 15 (xxBA)
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    VPXOR XTMP3, XTMP5, XTMP5; \ // XTMP5 = x ^ (x <<< 15) (xxBA)
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    VPSLLD $23, XTMP3, XTMP2; \ // XTMP2 = XTMP3 << 23
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f; \
    VPSRLD $(32-23), XTMP3, XTMP4 // XTMP4 = XTMP3 >> (32-23)

#define ROUND_AND_SCHED_0_15_2(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL 2*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    VPOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = x <<< 23 (xxBA)
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPXOR XTMP5, XTMP4, XTMP4; \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (xxBA)
    ; \
    ADDL (wj2 + 2*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    VPXOR XTMP4, XTMP1, XTMP2; \ // XTMP2 = {., ., w1, w0}
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    VPALIGNR $4, XDWORD3, XTMP2, XTMP3; \ // XTMP3 = {DCBA}
    ; \
    ADDL (wj + 2*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    VPSLLD $15, XTMP3, XTMP4; \ // XTMP4 = W[-3] << 15
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    VPSRLD $(32-15), XTMP3, XTMP5; \ // XTMP5 = W[-3] >> (32-15)
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    VPOR XTMP4, XTMP5, XTMP5; \ // XTMP5 = W[-3] <<< 15 {DCBA}
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    VPXOR XTMP0, XTMP5, XTMP3; \ // XTMP3 = x {DCBA}
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f; \
    VPSLLD $15, XTMP3, XTMP4 // XTMP4 = XTMP3 << 15

#define ROUND_AND_SCHED_0_15_3(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL 3*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    VPSRLD $(32-15), XTMP3, XTMP5; \ // XTMP5 = XTMP3 >> (32-15)
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPOR XTMP5, XTMP4, XTMP4; \ // XTMP4 = x <<< 15 (DCBA)
    ; \
    ADDL (wj2 + 3*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    VPXOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = x ^ (x <<< 15) (DCBA)
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    VPSLLD $23, XTMP3, XTMP5; \ // XTMP5 = XTMP3 << 23
    ; \
    ADDL (wj + 3*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    VPSRLD $(32-23), XTMP3, XTMP3; \ // XTMP3 >> (32-23), safe to overwrite XTMP3 in place
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    VPOR XTMP3, XTMP5, XTMP5; \ // XTMP5 = x <<< 23
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    VPXOR XTMP5, XTMP4, XTMP4; \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (DCBA)
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    VPXOR XTMP4, XTMP1, XDWORD0; \ // XDWORD0 = {W3, W2, W1, W0}
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_AND_SCHED_16_63_0(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    VPALIGNR $12, XDWORD0, XDWORD1, XTMP0; \ // XTMP0 = W[-13]
    ADDL 0*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPSLLD $7, XTMP0, XTMP1; \
    ; \
    ADDL (wj2 + 0*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    VPSRLD $(32-7), XTMP0, XTMP2; \
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    VPOR XTMP1, XTMP2, XTMP3; \ // XTMP3 = W[-13] <<< 7
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 0*4)(SP)(SRND*1), h; \
    VPALIGNR $8, XDWORD2, XDWORD3, XTMP1; \ // XTMP1 = W[-6]
    MOVL e, y3; \
    ANDL f, y3; \
    ANDNL g, e, y2; \
    VPXOR XTMP3, XTMP1, XTMP1; \ // XTMP1 = W[-6] ^ (W[-13]<<<7), term outside P1
    ORL y2, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    VPALIGNR $12, XDWORD1, XDWORD2, XTMP0; \ // XTMP0 = W[-9]
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    VPXOR XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-9] ^ W[-16], term inside P1
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    VPSHUFD $0xA5, XDWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA}, to be expanded
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_AND_SCHED_16_63_1(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    VPSLLQ $15, XTMP2, XTMP3; \ // XTMP3 = W[-3] <<< 15 {BxAx}
    ADDL 1*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPSHUFB shuff_00BA<>(SB), XTMP3, XTMP3; \ // XTMP3 = s1 {00BA}
    ; \
    ADDL (wj2 + 1*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    VPXOR XTMP0, XTMP3, XTMP3; \ // XTMP3 = x {xxBA}, kept for later use
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    VPSLLD $15, XTMP3, XTMP2; \ // XTMP2 = x << 15
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 1*4)(SP)(SRND*1), h; \
    VPSRLD $(32-15), XTMP3, XTMP4; \ // XTMP4 = x >> (32-15)
    MOVL e, y3; \
    ANDL f, y3; \
    ANDNL g, e, y2; \
    VPOR XTMP2, XTMP4, XTMP5; \ // XTMP5 = x <<< 15 (xxBA)
    ORL y2, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    VPXOR XTMP3, XTMP5, XTMP5; \ // XTMP5 = x ^ (x <<< 15) (xxBA)
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    VPSLLD $23, XTMP3, XTMP2; \ // XTMP2 = XTMP3 << 23
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    VPSRLD $(32-23), XTMP3, XTMP4; \ // XTMP4 = XTMP3 >> (32-23)
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_AND_SCHED_16_63_2(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    VPOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = x <<< 23 (xxBA)
    ADDL 2*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPXOR XTMP5, XTMP4, XTMP4; \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (xxBA)
    ; \
    ADDL (wj2 + 2*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    VPXOR XTMP4, XTMP1, XTMP2; \ // XTMP2 = {., ., w1, w0}
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    VPALIGNR $4, XDWORD3, XTMP2, XTMP3; \ // XTMP3 = {DCBA}
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 2*4)(SP)(SRND*1), h; \
    VPSLLD $15, XTMP3, XTMP4; \ // XTMP4 = W[-3] << 15
    MOVL e, y3; \
    ANDL f, y3; \
    ANDNL g, e, y2; \
    VPSRLD $(32-15), XTMP3, XTMP5; \ // XTMP5 = W[-3] >> (32-15)
    ORL y2, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    VPOR XTMP4, XTMP5, XTMP5; \ // XTMP5 = W[-3] <<< 15 {DCBA}
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    VPXOR XTMP0, XTMP5, XTMP3; \ // XTMP3 = x {DCBA}
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    VPSLLD $15, XTMP3, XTMP4; \ // XTMP4 = XTMP3 << 15
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_AND_SCHED_16_63_3(wj, wj2, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    VPSRLD $(32-15), XTMP3, XTMP5; \ // XTMP5 = XTMP3 >> (32-15)
    ADDL 3*4(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    VPOR XTMP5, XTMP4, XTMP4; \ // XTMP4 = x <<< 15 (DCBA)
    ; \
    ADDL (wj2 + 3*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    VPXOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = x ^ (x <<< 15) (DCBA)
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    ADDL T1, d; \
    VPSLLD $23, XTMP3, XTMP5; \ // XTMP5 = XTMP3 << 23
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 3*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    ANDL f, y3; \
    VPSRLD $(32-23), XTMP3, XTMP3; \ // XTMP3 >> (32-23), safe to overwrite XTMP3 in place
    ANDNL g, e, y2; \
    ORL y2, y3; \
    ADDL y3, h; \
    VPOR XTMP3, XTMP5, XTMP5; \ // XTMP5 = x <<< 23
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    VPXOR XTMP5, XTMP4, XTMP4; \ // XTMP4 = x ^ (x <<< 15) ^ (x <<< 23) (DCBA)
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    VPXOR XTMP4, XTMP1, XDWORD0; \ // XDWORD0 = {W3, W2, W1, W0}
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f
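// The ROUND_0_15_* and ROUND_16_63_* macros below perform the same four-round
// state update as above but with no message scheduling: W_j and W'_j are read
// back from the stack slots filled on the first pass. The extra "flag"
// argument adds flag*16 to the TBL index, selecting the second 16-byte half of
// the corresponding 32-byte TSHF row; when the second block is processed
// (flag = 1) the wj/wj2 arguments also carry a matching +16 offset into its
// half of the stored schedule.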
#define ROUND_0_15_0(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (0*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 0*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 0*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_0_15_1(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (1*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 1*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 1*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_0_15_2(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (2*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 2*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 2*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_0_15_3(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (3*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 3*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    XORL b, T1; \
    XORL c, T1; \
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 3*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    XORL f, y3; \
    XORL g, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_16_63_0(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (0*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 0*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 0*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    ANDL f, y3; \
    ANDNL g, e, y2; \
    ORL y2, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_16_63_1(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (1*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 1*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 1*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    ANDL f, y3; \
    ANDNL g, e, y2; \
    ORL y2, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_16_63_2(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (2*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 2*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 2*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    ANDL f, y3; \
    ANDNL g, e, y2; \
    ORL y2, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f

#define ROUND_16_63_3(wj, wj2, flag, a, b, c, d, e, f, g, h) \
    MOVL e, y2; \ // y2 = E
    RORXL $20, a, y1; \ // y1 = A <<< 12
    ADDL (3*4+flag*16)(TBL)(SRND*1), y2; \ // y2 = E + Ti
    ADDL y1, y2; \ // y2 = (A<<<12) + E + Ti
    RORXL $25, y2, y0; \ // y0 = ((A<<<12)+E+Ti) <<< 7 = SS1
    XORL y0, y1; \ // y1 = SS1 ^ (A<<<12) = SS2
    ; \
    ADDL (wj2 + 3*4)(SP)(SRND*1), d; \
    MOVL a, T1; \
    ORL c, T1; \ // a|c
    ANDL b, T1; \ // (a|c)&b
    MOVL c, y2; \
    ANDL a, y2; \
    ORL y2, T1; \ // (a|c)&b | a&c
    ADDL T1, d; \
    ADDL y1, d; \ // d = TT1
    ; \
    ADDL (wj + 3*4)(SP)(SRND*1), h; \
    MOVL e, y3; \
    ANDL f, y3; \
    ANDNL g, e, y2; \
    ORL y2, y3; \
    ADDL y3, h; \
    ADDL y0, h; \ // h = TT2
    ; \
    RORXL $23, h, y2; \
    RORXL $15, h, y3; \
    XORL h, y2; \
    ; \
    MOVL d, h; \
    XORL y2, y3; \
    MOVL y3, d; \
    ; \
    RORXL $23, b, b; \
    RORXL $13, f, f
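// Overview of blockasm (summarizing the code that follows):
//
//  - The ·useAVX2 check is effectively a no-op: both branches end up at the
//    avx2 label, so only the AVX2 path is implemented in this file.
//  - The $1080 frame matches the sizing comment (68+64)*4*2+8+8+8: the W and
//    W' schedule of two blocks plus the saved input pointers.
//  - loop0 loads 128 bytes (two message blocks) per iteration; the VPERM2I128
//    transpose puts block 1's group of four words in the low 16 bytes of each
//    YMM row and block 2's in the high 16 bytes.
//  - loop1_* hashes the first block while storing the schedule of both blocks:
//    W groups at _XFER(SP) and W' = W_j ^ W_{j+4} groups at _XFER+17*32(SP);
//    the skipped row in between holds _INP_END and _INP.
//  - loop2_* then hashes the second block directly from those stack slots,
//    using the +16 offsets to select its half of each row.
//  - TBL points at TSHF, the table of precomputed T_j <<< j round constants.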
// (68+64)*4*2+8+8+8
TEXT ·blockasm(SB), 0, $1080-48
    CMPB ·useAVX2(SB), $1
    JE   avx2

avx2:
    MOVQ dig+0(FP), CTX          // dig.h
    MOVQ p_base+8(FP), INP       // input
    MOVQ p_len+16(FP), NUM_BYTES // input length

    LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // pointer to the last block
    MOVQ NUM_BYTES, _INP_END(SP)

    CMPQ NUM_BYTES, INP
    JE   avx2_only_one_block

    MOVL 0(CTX), a  // a = H0
    MOVL 4(CTX), b  // b = H1
    MOVL 8(CTX), c  // c = H2
    MOVL 12(CTX), d // d = H3
    MOVL 16(CTX), e // e = H4
    MOVL 20(CTX), f // f = H5
    MOVL 24(CTX), g // g = H6
    MOVL 28(CTX), h // h = H7

loop0: // load input

    VMOVDQU (0*32)(INP), XTMP0
    VMOVDQU (1*32)(INP), XTMP1
    VMOVDQU (2*32)(INP), XTMP2
    VMOVDQU (3*32)(INP), XTMP3

    VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

    // Apply byte flip mask: LE -> BE
    VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
    VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
    VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
    VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

    // Transpose data into high/low parts
    VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
    VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
    VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
    VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

    MOVQ $TSHF<>(SB), TBL

avx2_last_block_enter:
    ADDQ $64, INP
    MOVQ INP, _INP(SP)
    XORQ SRND, SRND

loop1_1: // w16-w31 and the first 16 rounds, SRND up to 4*32

    VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1) // wj
    VPXOR   XDWORD1, XDWORD0, XDWORD4           // wj2
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

    ADDQ $32, SRND

    VMOVDQU XDWORD1, (_XFER + 0*32)(SP)(SRND*1)
    VPXOR   XDWORD2, XDWORD1, XDWORD4
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

    ADDQ $32, SRND

    VMOVDQU XDWORD2, (_XFER + 0*32)(SP)(SRND*1)
    VPXOR   XDWORD3, XDWORD2, XDWORD4
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

    ADDQ $32, SRND

    VMOVDQU XDWORD3, (_XFER + 0*32)(SP)(SRND*1)
    VPXOR   XDWORD0, XDWORD3, XDWORD4
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_0_15_0(_XFER + 0*32, _XFER + 17*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_0_15_1(_XFER + 0*32, _XFER + 17*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_0_15_2(_XFER + 0*32, _XFER + 17*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_0_15_3(_XFER + 0*32, _XFER + 17*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ADDQ $32, SRND

loop1_2: // w32-w64, SRND up to 3*4*32
    // TODO: pass the tshift offset as a parameter so it no longer depends on SRND,
    // rewrite ROUND_AND_SCHED accordingly, and save three ADDQs.

    VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1) // wj
    VPXOR   XDWORD1, XDWORD0, XDWORD4           // wj2
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

    ADDQ $32, SRND

    VMOVDQU XDWORD1, (_XFER + 0*32)(SP)(SRND*1)
    VPXOR   XDWORD2, XDWORD1, XDWORD4
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

    ADDQ $32, SRND

    VMOVDQU XDWORD2, (_XFER + 0*32)(SP)(SRND*1)
    VPXOR   XDWORD3, XDWORD2, XDWORD4
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

    ADDQ $32, SRND

    VMOVDQU XDWORD3, (_XFER + 0*32)(SP)(SRND*1)
    VPXOR   XDWORD0, XDWORD3, XDWORD4
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

    ADDQ $32, SRND
    CMPQ SRND, $3*4*32
    JB   loop1_2

loop1_3: // w64-w67, last 16 rounds and 4 message schedules

    VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*1) // wj
    VPXOR   XDWORD1, XDWORD0, XDWORD4           // wj2
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_AND_SCHED_16_63_0(_XFER + 0*32, _XFER + 17*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_16_63_1(_XFER + 0*32, _XFER + 17*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_16_63_2(_XFER + 0*32, _XFER + 17*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ROUND_AND_SCHED_16_63_3(_XFER + 0*32, _XFER + 17*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
    ADDQ $32, SRND

    VMOVDQU XDWORD1, (_XFER + 0*32)(SP)(SRND*1) // wj
    VPXOR   XDWORD2, XDWORD1, XDWORD4           // wj2
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_16_63_0(_XFER + 0*32, _XFER + 17*32, 0, e, f, g, h, a, b, c, d)
    ROUND_16_63_1(_XFER + 0*32, _XFER + 17*32, 0, d, e, f, g, h, a, b, c)
    ROUND_16_63_2(_XFER + 0*32, _XFER + 17*32, 0, c, d, e, f, g, h, a, b)
    ROUND_16_63_3(_XFER + 0*32, _XFER + 17*32, 0, b, c, d, e, f, g, h, a)
    ADDQ $32, SRND

    VMOVDQU XDWORD2, (_XFER + 0*32)(SP)(SRND*1) // wj
    VPXOR   XDWORD3, XDWORD2, XDWORD4           // wj2
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_16_63_0(_XFER + 0*32, _XFER + 17*32, 0, a, b, c, d, e, f, g, h)
    ROUND_16_63_1(_XFER + 0*32, _XFER + 17*32, 0, h, a, b, c, d, e, f, g)
    ROUND_16_63_2(_XFER + 0*32, _XFER + 17*32, 0, g, h, a, b, c, d, e, f)
    ROUND_16_63_3(_XFER + 0*32, _XFER + 17*32, 0, f, g, h, a, b, c, d, e)
    ADDQ $32, SRND

    VMOVDQU XDWORD3, (_XFER + 0*32)(SP)(SRND*1) // wj
    VPXOR   XDWORD0, XDWORD3, XDWORD4           // wj2
    VMOVDQU XDWORD4, (_XFER + 17*32)(SP)(SRND*1)
    ROUND_16_63_0(_XFER + 0*32, _XFER + 17*32, 0, e, f, g, h, a, b, c, d)
    ROUND_16_63_1(_XFER + 0*32, _XFER + 17*32, 0, d, e, f, g, h, a, b, c)
    ROUND_16_63_2(_XFER + 0*32, _XFER + 17*32, 0, c, d, e, f, g, h, a, b)
    ROUND_16_63_3(_XFER + 0*32, _XFER + 17*32, 0, b, c, d, e, f, g, h, a)
    ADDQ $32, SRND

    MOVQ dig+0(FP), CTX // dig.h
    MOVQ _INP(SP), INP

    xorm( 0(CTX), a)
    xorm( 4(CTX), b)
    xorm( 8(CTX), c)
    xorm( 12(CTX), d)
    xorm( 16(CTX), e)
    xorm( 20(CTX), f)
    xorm( 24(CTX), g)
    xorm( 28(CTX), h)

    CMPQ _INP_END(SP), INP
    JB   done_hash

    XORQ SRND, SRND

loop2_0: // do the second block with the previously scheduled results wj/wj2

    ROUND_0_15_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, a, b, c, d, e, f, g, h)
    ROUND_0_15_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, h, a, b, c, d, e, f, g)
    ROUND_0_15_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, g, h, a, b, c, d, e, f)
    ROUND_0_15_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, f, g, h, a, b, c, d, e)
    ADDQ $32, SRND

    ROUND_0_15_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, e, f, g, h, a, b, c, d)
    ROUND_0_15_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, d, e, f, g, h, a, b, c)
    ROUND_0_15_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, c, d, e, f, g, h, a, b)
    ROUND_0_15_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, b, c, d, e, f, g, h, a)
    ADDQ $32, SRND

    CMPQ SRND, $4*32
    JB   loop2_0

loop2_1:
    ROUND_16_63_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, a, b, c, d, e, f, g, h)
    ROUND_16_63_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, h, a, b, c, d, e, f, g)
    ROUND_16_63_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, g, h, a, b, c, d, e, f)
    ROUND_16_63_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, f, g, h, a, b, c, d, e)
    ADDQ $32, SRND

    ROUND_16_63_0(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, e, f, g, h, a, b, c, d)
    ROUND_16_63_1(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, d, e, f, g, h, a, b, c)
    ROUND_16_63_2(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, c, d, e, f, g, h, a, b)
    ROUND_16_63_3(_XFER + 0*32 + 16, _XFER + 17*32 + 16, 1, b, c, d, e, f, g, h, a)
    ADDQ $32, SRND

    CMPQ SRND, $4*4*32
    JB   loop2_1

    MOVQ dig+0(FP), CTX // output
    MOVQ _INP(SP), INP
    ADDQ $64, INP

    xorm( 0(CTX), a)
    xorm( 4(CTX), b)
    xorm( 8(CTX), c)
    xorm( 12(CTX), d)
    xorm( 16(CTX), e)
    xorm( 20(CTX), f)
    xorm( 24(CTX), g)
    xorm( 28(CTX), h)

    CMPQ _INP_END(SP), INP
    JA   loop0
    JB   done_hash

avx2_do_last_block:

    VMOVDQU 0(INP), XWORD0
    VMOVDQU 16(INP), XWORD1
    VMOVDQU 32(INP), XWORD2
    VMOVDQU 48(INP), XWORD3

    VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

    VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
    VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
    VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
    VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

    MOVQ $TSHF<>(SB), TBL

    JMP avx2_last_block_enter

avx2_only_one_block:
    // Load initial digest
    MOVL 0(CTX), a  // a = H0
    MOVL 4(CTX), b  // b = H1
    MOVL 8(CTX), c  // c = H2
    MOVL 12(CTX), d // d = H3
    MOVL 16(CTX), e // e = H4
    MOVL 20(CTX), f // f = H5
    MOVL 24(CTX), g // g = H6
    MOVL 28(CTX), h // h = H7

    JMP avx2_do_last_block

done_hash:
    VZEROUPPER
    RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle BxAx -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0f0e0d0c07060504
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0f0e0d0c07060504
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32
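// TSHF layout: each 32-byte row holds the four round constants T_j <<< j for
// one group of rounds, stored twice (once per 16-byte half) so the same SRND
// index can address constants for either block of a pair. T_j is 0x79cc4519
// for rounds 0-15 and 0x7a879d8a for rounds 16-63, so the 16 rows cover
// j = 0..63 in steps of four.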
// tshift table for 2 blocks
DATA TSHF<>+0x0(SB)/4, $0x79cc4519
DATA TSHF<>+0x4(SB)/4, $0xf3988a32
DATA TSHF<>+0x8(SB)/4, $0xe7311465
DATA TSHF<>+0xc(SB)/4, $0xce6228cb
DATA TSHF<>+0x10(SB)/4, $0x79cc4519
DATA TSHF<>+0x14(SB)/4, $0xf3988a32
DATA TSHF<>+0x18(SB)/4, $0xe7311465
DATA TSHF<>+0x1c(SB)/4, $0xce6228cb

DATA TSHF<>+0x20(SB)/4, $0x9cc45197
DATA TSHF<>+0x24(SB)/4, $0x3988a32f
DATA TSHF<>+0x28(SB)/4, $0x7311465e
DATA TSHF<>+0x2c(SB)/4, $0xe6228cbc
DATA TSHF<>+0x30(SB)/4, $0x9cc45197
DATA TSHF<>+0x34(SB)/4, $0x3988a32f
DATA TSHF<>+0x38(SB)/4, $0x7311465e
DATA TSHF<>+0x3c(SB)/4, $0xe6228cbc

DATA TSHF<>+0x40(SB)/4, $0xcc451979
DATA TSHF<>+0x44(SB)/4, $0x988a32f3
DATA TSHF<>+0x48(SB)/4, $0x311465e7
DATA TSHF<>+0x4c(SB)/4, $0x6228cbce
DATA TSHF<>+0x50(SB)/4, $0xcc451979
DATA TSHF<>+0x54(SB)/4, $0x988a32f3
DATA TSHF<>+0x58(SB)/4, $0x311465e7
DATA TSHF<>+0x5c(SB)/4, $0x6228cbce

DATA TSHF<>+0x60(SB)/4, $0xc451979c
DATA TSHF<>+0x64(SB)/4, $0x88a32f39
DATA TSHF<>+0x68(SB)/4, $0x11465e73
DATA TSHF<>+0x6c(SB)/4, $0x228cbce6
DATA TSHF<>+0x70(SB)/4, $0xc451979c
DATA TSHF<>+0x74(SB)/4, $0x88a32f39
DATA TSHF<>+0x78(SB)/4, $0x11465e73
DATA TSHF<>+0x7c(SB)/4, $0x228cbce6

DATA TSHF<>+0x80(SB)/4, $0x9d8a7a87
DATA TSHF<>+0x84(SB)/4, $0x3b14f50f
DATA TSHF<>+0x88(SB)/4, $0x7629ea1e
DATA TSHF<>+0x8c(SB)/4, $0xec53d43c
DATA TSHF<>+0x90(SB)/4, $0x9d8a7a87
DATA TSHF<>+0x94(SB)/4, $0x3b14f50f
DATA TSHF<>+0x98(SB)/4, $0x7629ea1e
DATA TSHF<>+0x9c(SB)/4, $0xec53d43c

DATA TSHF<>+0xa0(SB)/4, $0xd8a7a879
DATA TSHF<>+0xa4(SB)/4, $0xb14f50f3
DATA TSHF<>+0xa8(SB)/4, $0x629ea1e7
DATA TSHF<>+0xac(SB)/4, $0xc53d43ce
DATA TSHF<>+0xb0(SB)/4, $0xd8a7a879
DATA TSHF<>+0xb4(SB)/4, $0xb14f50f3
DATA TSHF<>+0xb8(SB)/4, $0x629ea1e7
DATA TSHF<>+0xbc(SB)/4, $0xc53d43ce

DATA TSHF<>+0xc0(SB)/4, $0x8a7a879d
DATA TSHF<>+0xc4(SB)/4, $0x14f50f3b
DATA TSHF<>+0xc8(SB)/4, $0x29ea1e76
DATA TSHF<>+0xcc(SB)/4, $0x53d43cec
DATA TSHF<>+0xd0(SB)/4, $0x8a7a879d
DATA TSHF<>+0xd4(SB)/4, $0x14f50f3b
DATA TSHF<>+0xd8(SB)/4, $0x29ea1e76
DATA TSHF<>+0xdc(SB)/4, $0x53d43cec

DATA TSHF<>+0xe0(SB)/4, $0xa7a879d8
DATA TSHF<>+0xe4(SB)/4, $0x4f50f3b1
DATA TSHF<>+0xe8(SB)/4, $0x9ea1e762
DATA TSHF<>+0xec(SB)/4, $0x3d43cec5
DATA TSHF<>+0xf0(SB)/4, $0xa7a879d8
DATA TSHF<>+0xf4(SB)/4, $0x4f50f3b1
DATA TSHF<>+0xf8(SB)/4, $0x9ea1e762
DATA TSHF<>+0xfc(SB)/4, $0x3d43cec5

DATA TSHF<>+0x100(SB)/4, $0x7a879d8a
DATA TSHF<>+0x104(SB)/4, $0xf50f3b14
DATA TSHF<>+0x108(SB)/4, $0xea1e7629
DATA TSHF<>+0x10c(SB)/4, $0xd43cec53
DATA TSHF<>+0x110(SB)/4, $0x7a879d8a
DATA TSHF<>+0x114(SB)/4, $0xf50f3b14
DATA TSHF<>+0x118(SB)/4, $0xea1e7629
DATA TSHF<>+0x11c(SB)/4, $0xd43cec53

DATA TSHF<>+0x120(SB)/4, $0xa879d8a7
DATA TSHF<>+0x124(SB)/4, $0x50f3b14f
DATA TSHF<>+0x128(SB)/4, $0xa1e7629e
DATA TSHF<>+0x12c(SB)/4, $0x43cec53d
DATA TSHF<>+0x130(SB)/4, $0xa879d8a7
DATA TSHF<>+0x134(SB)/4, $0x50f3b14f
DATA TSHF<>+0x138(SB)/4, $0xa1e7629e
DATA TSHF<>+0x13c(SB)/4, $0x43cec53d

DATA TSHF<>+0x140(SB)/4, $0x879d8a7a
DATA TSHF<>+0x144(SB)/4, $0x0f3b14f5
DATA TSHF<>+0x148(SB)/4, $0x1e7629ea
DATA TSHF<>+0x14c(SB)/4, $0x3cec53d4
DATA TSHF<>+0x150(SB)/4, $0x879d8a7a
DATA TSHF<>+0x154(SB)/4, $0x0f3b14f5
DATA TSHF<>+0x158(SB)/4, $0x1e7629ea
DATA TSHF<>+0x15c(SB)/4, $0x3cec53d4

DATA TSHF<>+0x160(SB)/4, $0x79d8a7a8
DATA TSHF<>+0x164(SB)/4, $0xf3b14f50
DATA TSHF<>+0x168(SB)/4, $0xe7629ea1
DATA TSHF<>+0x16c(SB)/4, $0xcec53d43
DATA TSHF<>+0x170(SB)/4, $0x79d8a7a8
DATA TSHF<>+0x174(SB)/4, $0xf3b14f50
DATA TSHF<>+0x178(SB)/4, $0xe7629ea1
DATA TSHF<>+0x17c(SB)/4, $0xcec53d43

DATA TSHF<>+0x180(SB)/4, $0x9d8a7a87
DATA TSHF<>+0x184(SB)/4, $0x3b14f50f
DATA TSHF<>+0x188(SB)/4, $0x7629ea1e
DATA TSHF<>+0x18c(SB)/4, $0xec53d43c
DATA TSHF<>+0x190(SB)/4, $0x9d8a7a87
DATA TSHF<>+0x194(SB)/4, $0x3b14f50f
DATA TSHF<>+0x198(SB)/4, $0x7629ea1e
DATA TSHF<>+0x19c(SB)/4, $0xec53d43c

DATA TSHF<>+0x1a0(SB)/4, $0xd8a7a879
DATA TSHF<>+0x1a4(SB)/4, $0xb14f50f3
DATA TSHF<>+0x1a8(SB)/4, $0x629ea1e7
DATA TSHF<>+0x1ac(SB)/4, $0xc53d43ce
DATA TSHF<>+0x1b0(SB)/4, $0xd8a7a879
DATA TSHF<>+0x1b4(SB)/4, $0xb14f50f3
DATA TSHF<>+0x1b8(SB)/4, $0x629ea1e7
DATA TSHF<>+0x1bc(SB)/4, $0xc53d43ce

DATA TSHF<>+0x1c0(SB)/4, $0x8a7a879d
DATA TSHF<>+0x1c4(SB)/4, $0x14f50f3b
DATA TSHF<>+0x1c8(SB)/4, $0x29ea1e76
DATA TSHF<>+0x1cc(SB)/4, $0x53d43cec
DATA TSHF<>+0x1d0(SB)/4, $0x8a7a879d
DATA TSHF<>+0x1d4(SB)/4, $0x14f50f3b
DATA TSHF<>+0x1d8(SB)/4, $0x29ea1e76
DATA TSHF<>+0x1dc(SB)/4, $0x53d43cec

DATA TSHF<>+0x1e0(SB)/4, $0xa7a879d8
DATA TSHF<>+0x1e4(SB)/4, $0x4f50f3b1
DATA TSHF<>+0x1e8(SB)/4, $0x9ea1e762
DATA TSHF<>+0x1ec(SB)/4, $0x3d43cec5
DATA TSHF<>+0x1f0(SB)/4, $0xa7a879d8
DATA TSHF<>+0x1f4(SB)/4, $0x4f50f3b1
DATA TSHF<>+0x1f8(SB)/4, $0x9ea1e762
DATA TSHF<>+0x1fc(SB)/4, $0x3d43cec5
GLOBL TSHF<>(SB), (NOPTR + RODATA), $512
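// The matching Go-side declaration is assumed from the FP offsets used above
// (dig+0, p_base+8, p_len+16) rather than taken from the package's Go sources;
// it would look roughly like
//
//	//go:noescape
//	func blockasm(dig *digest, p []byte)
//
// where "digest" stands for whatever struct exposes the eight uint32 state
// words at byte offsets 0..28; the exact type name here is hypothetical.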