dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
dnl  a third limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
C    4 large loops into one; we could use it for the outer loop branch.
C  * Optimise code outside of inner loops.
C  * Write combined addmul_1 feed-in and wind-down code, and use when iterating
C    each outer loop.  ("Overlapping software pipelining")
C  * Postpone push of ebx until we know vn > 1.  Perhaps use caller-saves regs
C    for inlined mul_1, allowing us to postpone all pushes.
C  * Perhaps write special code for vn <= un < M, for some small M.

C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xn,
C                        mp_srcptr yp, mp_size_t yn);
C
C Syntax: AT&T operand order (source, destination), 32-bit x86, run through
C m4 with the macros supplied by config.m4.
C
C Arguments are read from the stack.  After the four register pushes done in
C the prologue they are found at:
C
C	20(%esp)	wp	loaded into rp (%edi)
C	24(%esp)	xp	loaded into up (%esi), reloaded for every row
C	28(%esp)	xn	loaded into un (%ecx), reloaded for every row
C	32(%esp)	yp	loaded into vp (%ebp)
C	36(%esp)	yn	used in place as vn, the outer loop counter; the
C				stack slot itself is decremented
C
C Register roles:
C
C	%mm7		current limb of the second operand
C	%mm6		64-bit running sum during the first row (low half is the
C			limb to store, high half the carry into the next limb)
C	%mm0, %mm1	64-bit limb products
C	%eax, %ebx	low words of products in the accumulate rows
C	%edx		high word of the previous product (carry limb)
C
C Layout: xn mod 4 selects one of four code sections (3, 0, 1, 2 in file
C order).  Each section holds a 4-way unrolled first-row loop that stores
C xp[] times yp[0], entered part way through according to xn mod 4, followed
C by an outer loop that multiplies xp[] by each further yp limb and adds the
C row into the result.
C
C %edi, %esi, %ebx and %ebp are saved on entry and restored at L(done); emms
C is executed before returning because MMX registers are used.
C
C NOTE(review): no size checks are made here; presumably callers guarantee
C xn >= yn >= 1 as for the other mpn_mul_basecase variants -- confirm against
C the GMP documentation.

define(`rp', `%edi')
define(`up', `%esi')
define(`un', `%ecx')
define(`vp', `%ebp')
define(`vn', `36(%esp)')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	push	%edi
	push	%esi
	push	%ebx
	push	%ebp
	mov	20(%esp), rp
	mov	24(%esp), up
	mov	28(%esp), un
	mov	32(%esp), vp

C Start the first product (xp[0] times yp[0]) and clear the running sum.
	movd	(up), %mm0
	movd	(vp), %mm7
	pmuludq	%mm7, %mm0
	pxor	%mm6, %mm6

C Dispatch on un mod 4.  The cmp sets CF for the value 1 (0 has already
C been taken by the jz) and ZF for the value 2; 3 falls through.
	mov	un, %eax
	and	$3, %eax
	jz	L(of0)
	cmp	$2, %eax
	jc	L(of1)
	jz	L(of2)

C ================================================================
C Section for un mod 4 == 3.
C First row: the loop is entered at L(m3).  The movd and lea instructions
C between the sub and the ja leave the flags alone, so the ja tests the
C result of sub $4, un.
	jmp	L(m3)
	ALIGN(16)
L(lm3):	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(m3):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	sub	$4, un
	movd	%mm6, 8(rp)
	lea	16(up), up
	ja	L(lm3)

C Store the final carry limb of the first row.
	psrlq	$32, %mm6
	movd	%mm6, 12(rp)

	decl	vn
	jz	L(done)
	lea	-8(rp), rp

C Outer loop: one pass per remaining limb of the second operand.  un and up
C are reloaded from the argument slots; un is negated so the inner loop can
C count up towards zero, and rp is repositioned using the negative un.
L(ol3):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	16(rp,un,4), rp

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	4(up), %mm1
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	-8(up), up
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a3)

C Inner accumulate loop.  The adc instructions consume the carry left by the
C preceding add or adc; only movd, pmuludq, psrlq and lea sit in between, and
C the closing inc is used for the counter because it does not write CF.
L(la3):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
L(a3):	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	jnz	L(la3)

C Wind-down: adc un, X with un equal to zero adds just the carry flag.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol3)
	jmp	L(done)

C ================================================================
C Section for un mod 4 == 0.  First row entered at L(of0).
	ALIGN(16)
L(lm0):	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
L(of0):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 4(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	12(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, 8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	sub	$4, un
	movd	%mm6, 12(rp)
	lea	16(up), up
	ja	L(lm0)

C Store the final carry limb of the first row.
	psrlq	$32, %mm6
	movd	%mm6, 16(rp)

	decl	vn
	jz	L(done)
	lea	-4(rp), rp

C Outer loop for this section; the inner loop is entered at L(a0).
L(ol0):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	20(rp,un,4), rp

	movd	(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	4(up), %mm0
	lea	-4(up), up
	movd	%mm1, %eax
	pmuludq	%mm7, %mm0
	xor	%edx, %edx	C zero edx and CF
	jmp	L(a0)

L(la0):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
L(a0):	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	jnz	L(la0)

C Wind-down: adc un, X with un equal to zero adds just the carry flag.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol0)
	jmp	L(done)

C ================================================================
C Section for un mod 4 == 1.  First row entered at L(of1).
	ALIGN(16)
L(lm1):	movd	-12(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -12(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of1):	paddq	%mm0, %mm6
	sub	$4, un
	movd	%mm6, (rp)
	lea	16(up), up
	ja	L(lm1)

C Store the final carry limb of the first row.
	psrlq	$32, %mm6
	movd	%mm6, 4(rp)

	decl	vn
	jz	L(done)
	lea	-16(rp), rp

C Outer loop for this section; the inner loop is entered at L(a1).  The
C extra inc un before the jump adjusts the iteration count for this entry
C point and, like the one in the loop, does not write CF.
L(ol1):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	24(rp,un,4), rp

	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	sar	$2, un
	movd	%mm0, %ebx
	movd	4(up), %mm1
	pmuludq	%mm7, %mm1
	xor	%edx, %edx	C zero edx and CF
	inc	un
	jmp	L(a1)

L(la1):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
L(a1):	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	jnz	L(la1)

C Wind-down: adc un, X with un equal to zero adds just the carry flag.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol1)
	jmp	L(done)

C ================================================================
C Section for un mod 4 == 2.  First row entered at L(of2).
	ALIGN(16)
L(lm2):	movd	-8(up), %mm0
	pmuludq	%mm7, %mm0
	psrlq	$32, %mm6
	lea	16(rp), rp
	paddq	%mm0, %mm6
	movd	-4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -8(rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	movd	(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, -4(rp)
	psrlq	$32, %mm6
L(of2):	paddq	%mm0, %mm6
	movd	4(up), %mm0
	pmuludq	%mm7, %mm0
	movd	%mm6, (rp)
	psrlq	$32, %mm6
	paddq	%mm0, %mm6
	sub	$4, un
	movd	%mm6, 4(rp)
	lea	16(up), up
	ja	L(lm2)

C Store the final carry limb of the first row.
	psrlq	$32, %mm6
	movd	%mm6, 8(rp)

	decl	vn
	jz	L(done)
	lea	-12(rp), rp

C Outer loop for this section; the inner loop is entered at L(lo2).
L(ol2):	mov	28(%esp), un
	neg	un
	lea	4(vp), vp
	movd	(vp), %mm7		C read next V limb
	mov	24(%esp), up
	lea	12(rp,un,4), rp

	movd	(up), %mm1
	pmuludq	%mm7, %mm1
	sar	$2, un
	movd	4(up), %mm0
	lea	4(up), up
	movd	%mm1, %eax
	xor	%edx, %edx	C zero edx and CF
	jmp	L(lo2)

L(la2):	movd	4(up), %mm1
	adc	$0, %edx
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	pmuludq	%mm7, %mm1
	lea	16(rp), rp
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	movd	8(up), %mm0
	pmuludq	%mm7, %mm0
	adc	$0, %edx
	add	%ebx, (rp)
	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	movd	%mm0, %ebx
	movd	12(up), %mm1
	pmuludq	%mm7, %mm1
	adc	$0, %edx
	add	%eax, 4(rp)
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %edx
	movd	%mm1, %eax
	lea	16(up), up
	movd	(up), %mm0
	adc	$0, %edx
	add	%ebx, 8(rp)
L(lo2):	psrlq	$32, %mm1
	adc	%edx, %eax
	movd	%mm1, %edx
	pmuludq	%mm7, %mm0
	inc	un
	jnz	L(la2)

C Wind-down: adc un, X with un equal to zero adds just the carry flag.
	adc	un, %edx	C un is zero here
	add	%eax, 12(rp)
	movd	%mm0, %ebx
	psrlq	$32, %mm0
	adc	%edx, %ebx
	movd	%mm0, %eax
	adc	un, %eax
	add	%ebx, 16(rp)
	adc	un, %eax
	mov	%eax, 20(rp)

	decl	vn
	jnz	L(ol2)
C	jmp	L(done)		C not needed, L(done) follows directly

C ================================================================
C Common exit: leave MMX state, then restore the saved registers in the
C reverse of the order in which the prologue pushed them.
L(done):
	emms
	pop	%ebp
	pop	%ebx
	pop	%esi
	pop	%edi
	ret
EPILOGUE()