github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bobcat/sqr_basecase.asm

dnl  AMD64 mpn_sqr_basecase optimised for AMD bobcat.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4.5
C AMD K10	 4.5
C AMD bd1	 4.75
C AMD bobcat	 5
C Intel P4	17.7
C Intel core2	 5.5
C Intel NHM	 5.43
C Intel SBR	 3.92
C Intel atom	23
C VIA nano	 5.63

C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
C the same way for all n, then it splits into 4 different wind-down blocks and
C 4 separate addmul_1 loops.
C
C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations.  Doing that could save
C substantial code volume, but would also probably add some overhead.

C TODO
C  * Tune un < 4 code.
C  * Perhaps implement a larger final corner (it is now 2 x 1).
C  * Lots of space could be saved by replacing the "switch" code by gradual
C    jumps out from mul_1 winddown code, perhaps with no added overhead.
C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.
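
C In limb terms, the computation sketched above is (illustrative only, using
C mul_1/addmul_1 as notation; the loops below are inlined, not called):
C
C	mul_1    (rp+1,      up+1,   n-1,   up[0])
C	addmul_1 (rp+2*i+1,  up+i+1, n-1-i, up[i])	for i = 1 .. n-2
C
C which accumulates the cross products sum_{i<j} up[i]*up[j]*B^(i+j) into rp,
C after which sqr_diag_addlsh1 doubles that sum and adds the diagonal squares
C up[i]^2*B^(2*i) to form the full square.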

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Standard parameters
define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
C Standard allocations
define(`un',       `%rbx')
define(`w0',       `%r8')
define(`w1',       `%r9')
define(`w2',       `%r10')
define(`w3',       `%r11')
define(`n',        `%rbp')
define(`v0',       `%rcx')

C Temp macro for allowing control over indexing.
C Define to return $1 for more conservative ptr handling.
define(`X',`$2')
dnl define(`X',`$1')


ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	mov	(up), %rax

	cmp	$2, R32(un_param)
	jae	L(ge2)

	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(ge2):	mov	(up), v0
	jnz	L(g2)

	mul	%rax
	mov	%rax, (rp)
	mov	8(up), %rax
	mov	%rdx, w0
	mul	v0
	add	%rax, w0
	mov	%rdx, w1
	adc	$0, w1
	mov	8(up), v0
	mov	(up), %rax
	mul	v0
	add	%rax, w0
	mov	w0, 8(rp)
	mov	%rdx, w0	C CAUTION: r8 realloc
	adc	$0, w0
	mov	8(up), %rax
	mul	v0
	add	w1, w0
	adc	$0, %rdx
	add	w0, %rax
	adc	$0, %rdx
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret

L(g2):	cmp	$3, R32(un_param)
	ja	L(g3)
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	mov	8(up), %rax
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	(up), v0
	mov	8(up), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	16(up), %rax
	mul	v0
	xor	R32(w2), R32(w2)
	add	%rax, w1
	adc	%rdx, w2

	mov	8(up), v0
	mov	16(up), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, w2
	adc	%rdx, w3
	add	w0, w0
	adc	w1, w1
	adc	w2, w2
	adc	w3, w3
	mov	$0, R32(v0)
	adc	v0, v0
	add	w0, 8(rp)
	adc	w1, 16(rp)
	adc	w2, 24(rp)
	adc	w3, 32(rp)
	adc	v0, 40(rp)
	FUNC_EXIT()
	ret

L(g3):	push	%rbx
	push	%rbp

	mov	8(up), %rax
	lea	-24(rp,un_param,8), rp
	lea	-24(up,un_param,8), up
	neg	un_param
	push	un_param	C for sqr_diag_addlsh1
	lea	(un_param), un
	lea	3(un_param), n

	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(L3)

	ALIGN(16)
L(top):	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, -8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, (rp,n,8)
	add	w1, w2
	adc	$0, w3
L(L3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, 8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(top)

	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3

	test	n, n
	jz	L(r2)
	cmp	$2, R32(n)
	ja	L(r3)
	jz	L(r0)


L(r1):	mov	X((up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),8(rp))
	add	w1, w2
	adc	$0, w3
	mov	w2, X(8(rp,n,8),16(rp))
	mov	w3, X(16(rp,n,8),24(rp))
	add	$5, un
	jmp	L(to0)

L(r2):	mov	X((up,n,8),(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),-8(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),(rp))
	add	w1, w2
	adc	$0, w3
	mov	X(16(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X(16(rp,n,8),16(rp))
	adc	$0, w3
	mov	w1, X(24(rp,n,8),24(rp))
	add	$6, un
	jmp	L(to1)

L(r3):	mov	w2, X(-8(rp,n,8),16(rp))
	mov	w3, X((rp,n,8),24(rp))
	add	$3, un
	jmp	L(to2)

L(r0):	mov	X((up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X((rp,n,8),16(rp))
	mov	w1, X(8(rp,n,8),24(rp))
	add	$4, un
C	jmp	L(to3)
C fall through into main loop
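
C Each L(outer) pass (and the L(to0)..L(to2) entries reached from the
C wind-down blocks above) is one addmul_1 step, multiplying the remaining
C high limbs by the next limb of up and accumulating the products into rp;
C L(al0)..L(al3) are the entry points into the four 4-way unrolled loops.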

L(outer):
	mov	un, n
	mov	(up,un,8), v0
	mov	8(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al3)

	ALIGN(16)
L(ta3):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta3)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to2):	mov	un, n
	cmp	$-4, R32(un)
	jnc	L(end)
	add	$4, un
	mov	8(up,n,8), v0
	mov	16(up,n,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al2)

	ALIGN(16)
L(ta2):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al2):	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta2)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to1):	mov	un, n
	mov	-16(up,un,8), v0
	mov	-8(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al1)

	ALIGN(16)
L(ta1):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al1):	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta1)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to0):	mov	un, n
	mov	-8(up,un,8), v0
	mov	(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al0)

	ALIGN(16)
L(ta0):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al0):	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta0)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))
	jmp	L(outer)


L(end):	mov	X(8(up,un,8),(up)), v0
	mov	X(16(up,un,8),8(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	X(24(up,un,8),16(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, X(24(rp,un,8),16(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(32(rp,un,8),24(rp))
	adc	$0, w3
	mov	X(16(up,un,8),8(up)), v0
	mov	X(24(up,un,8),16(up)), %rax
	mul	v0
	add	%rax, w3
	mov	w3, X(40(rp,un,8),32(rp))
	adc	$0, %rdx
	mov	%rdx, X(48(rp,un,8),40(rp))


C sqr_diag_addlsh1
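C The loops above accumulate the cross products up[i]*up[j] (i < j) into rp.
C This final pass walks the diagonal: each iteration doubles two limbs of
C that sum (the adc w0,w0 / adc w1,w1 pair, with the carry kept in w2) and
C adds in the squares up[i]^2 produced by the mul %rax instructions,
C emitting two result limbs per input limb.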

	lea	16(up), up
	lea	40(rp), rp
	pop	n
	lea	2(n,n), n

	mov	(up,n,4), %rax
	mul	%rax
	xor	R32(w2), R32(w2)

	mov	8(rp,n,8), w0
	mov	%rax, (rp,n,8)
	jmp	L(lm)

	ALIGN(8)
L(tsd):	add	%rbx, w0
	adc	%rax, w1
	mov	w0, -8(rp,n,8)
	mov	8(rp,n,8), w0
	mov	w1, (rp,n,8)
L(lm):	mov	16(rp,n,8), w1
	adc	w0, w0
	adc	w1, w1
	lea	(%rdx,w2), %rbx
	mov	8(up,n,4), %rax
	setc	R8(w2)
	mul	%rax
	add	$2, n
	js	L(tsd)

L(esd):	add	%rbx, w0
	adc	%rax, w1
	mov	w0, X(-8(rp,n,8),-8(rp))
	mov	w1, X((rp,n,8),(rp))
	adc	w2, %rdx
	mov	%rdx, X(8(rp,n,8),8(rp))

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()