dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/v6/sqr_basecase.asm

dnl  ARM v6 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Code structure:
C
C
C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
C           |               |               |               |
C           |               |               |               |
C           |               |               |               |
C          \|/             \|/             \|/             \|/
C              ____________                   ____________
C             /            \                 /            \
C            \|/            \               \|/            \
C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
C            \            /|\                \            /|\
C             \____________/                  \____________/
C                       \                        /
C                        \                      /
C                         \                    /
C                         cor3             cor2
C                            \              /
C                             \            /
C                            sqr_diag_addlsh1

C TODO
C  * Align more labels.
C  * Further tweak counter and updates in outer loops.  (This could save
C    perhaps 5n cycles).
C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
C    initialise loop counter i with a right shift.
C  * Try to use fewer registers.  Perhaps coalesce r9 branch target and n_saved.
C    (This could save 2-3 cycles for n > 4.)
C    NOTE(review): in the code below the branch target lives in r10 and r9 is
C    u1 -- confirm which register this item refers to.
C  * Optimise sqr_diag_addlsh1 loop.  The current code uses old-style carry
C    propagation.
C  * Stop loops earlier suppressing writes of upper-most rp[] values.
C  * The addmul_2 loops here run well on all cores, but mul_2 runs poorly
C    particularly on Cortex-A8.


C Incoming arguments.
define(`rp', r0)
define(`up', r1)
define(`n', r2)

C Working registers for the general (n > 4) code.  v0, v1 are the current
C pair of multiplier limbs, u0, u1 the limbs streamed from up, cya and cyb
C the two carry chains, i the inner loop counter.  n_saved is r14 (lr), which
C is why the n > 4 paths push r14 and return by popping it into pc.
define(`v0', r3)
define(`v1', r6)
define(`i', r8)
define(`n_saved', r14)
define(`cya', r11)
define(`cyb', r12)
define(`u0', r7)
define(`u1', r9)

C mpn_sqr_basecase (rp, up, n)
C
C Limbs are 32-bit words (all indexing below is in units of 4 bytes).  The
C code computes the square of the n-limb operand at up into rp.  The
C straight-line cases at the end of the file (n <= 4) store 2n words.  The
C general case builds the off-diagonal products with one mul_2 pass followed
C by addmul_2 passes, then doubles that sum and adds the diagonal squares in
C L(sqr_diag_addlsh1).
C
C NOTE(review): n = 0 would dispatch to L(4); presumably callers guarantee
C n >= 1 -- confirm against the mpn interface.

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	and	r12, n, #3		C r12 = n mod 4
	cmp	n, #4
	addgt	r12, r12, #4		C n > 4: select the second half of the table
	add	pc, pc, r12, lsl #2	C computed branch; pc reads as this insn + 8,
	nop				C so table index 0 is the slot after this nop
	b	L(4)			C n = 4
	b	L(1)			C n = 1
	b	L(2)			C n = 2
	b	L(3)			C n = 3
	b	L(0m4)			C n > 4, n mod 4 = 0
	b	L(1m4)			C n > 4, n mod 4 = 1
	b	L(2m4)			C n > 4, n mod 4 = 2
	b	L(3m4)			C n > 4, n mod 4 = 3


C mul_2 entry points for n > 4, one per residue of n mod 4.  Each one saves
C r4-r11 and lr, keeps the original n in n_saved, sets i = n - 4 and
C n = n - 2, records in r10 the addmul_2 label to continue at after the mul_2
C pass, loads v0, v1 and a first u limb from up, starts the product with
C v1 * v0, and enters the unrolled loop below at the matching point.  The
C post-indexed store and the adjustment of up bias the pointers for that
C entry point.

L(1m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_2m4)-.-8	C r10 = address of L(am2_2m4)
	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
	sub	up, up, #4
	mov	cyb, #0
	mov	r5, #0
	umull	r4, cya, v1, v0		C cya:r4 = v1 * v0
	str	r4, [rp], #-12		C store low word at rp[0], then rp -= 12
	mov	r4, #0
	b	L(ko0)

L(3m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_0m4)-.-8	C r10 = address of L(am2_0m4)
	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
	add	up, up, #4
	mov	cyb, #0
	mov	r5, #0
	umull	r4, cya, v1, v0		C cya:r4 = v1 * v0
	str	r4, [rp], #-4		C store low word at rp[0], then rp -= 4
	mov	r4, #0
	b	L(ko2)

L(2m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_3m4)-.-8	C r10 = address of L(am2_3m4)
	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
	mov	cyb, #0
	mov	r4, #0
	umull	r5, cya, v1, v0		C cya:r5 = v1 * v0
	str	r5, [rp], #-8		C store low word at rp[0], then rp -= 8
	mov	r5, #0
	b	L(ko1)

L(0m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_1m4)-.-8	C r10 = address of L(am2_1m4)
	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
	mov	cyb, #0
	mov	r4, #0
	add	up, up, #8
	umull	r5, cya, v1, v0		C cya:r5 = v1 * v0
	str	r5, [rp, #0]
	mov	r5, #0

C mul_2 loop, 4-way unrolled.  umaal lo, hi, a, b computes
C hi:lo = a * b + lo + hi.  Each u limb is multiplied by v0 (carry chain in
C cya) and by v1 (carry chain in cyb); r4 and r5 alternate as the word being
C accumulated and are stored to rp without rp being read first.
L(top):	ldr	u0, [up, #4]
	umaal	r4, cya, u1, v0
	str	r4, [rp, #4]
	mov	r4, #0
	umaal	r5, cyb, u1, v1
L(ko2):	ldr	u1, [up, #8]
	umaal	r5, cya, u0, v0
	str	r5, [rp, #8]
	mov	r5, #0
	umaal	r4, cyb, u0, v1
L(ko1):	ldr	u0, [up, #12]
	umaal	r4, cya, u1, v0
	str	r4, [rp, #12]
	mov	r4, #0
	umaal	r5, cyb, u1, v1
L(ko0):	ldr	u1, [up, #16]!		C load and advance up by 4 limbs
	umaal	r5, cya, u0, v0
	str	r5, [rp, #16]!		C store and advance rp by 4 limbs
	mov	r5, #0
	umaal	r4, cyb, u0, v1
	subs	i, i, #4
	bhi	L(top)			C loop while the updated i is above zero

C Wind-down of the mul_2 pass: fold in the last two u limbs and store the
C remaining words, ending with the two carry words.
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #4]
	umaal	r5, cya, u0, v0
	umaal	cya, cyb, u0, v1
	str	r5, [rp, #8]
	str	cya, [rp, #12]
	str	cyb, [rp, #16]

	add	up, up, #4
	sub	n, n, #1
	add	rp, rp, #8
	bx	r10			C continue at the addmul_2 label chosen at entry

C addmul_2 passes.  Every pass loads a fresh v0, v1 pair from up and adds
C the products of the following limbs into the words already at rp; n drops
C by 2 per pass.  The four unrolled loops L(ua2), L(ua0), L(ua1), L(ua3) have
C the same body and differ only in where they are entered.
C L(evnloop)/L(am2_0m4) alternate in one cycle, which leaves through L(cor2);
C L(oddloop)/L(am2_3m4) alternate in the other, which leaves through L(cor3).

L(evnloop):
	subs	i, n, #6		C i = n - 6; flags consumed by blt below
	sub	n, n, #2		C (does not update flags)
	blt	L(cor2)			C n < 6: finish with the corner code
	ldm	up, {v0,v1,u1}
	add	up, up, #8
	mov	cya, #0
	mov	cyb, #0
	ldr	r4, [rp, #-4]
	umaal	r4, cya, v1, v0		C rp[-1] += v1 * v0, carry into cya
	str	r4, [rp, #-4]
	ldr	r4, [rp, #0]

	ALIGN(16)
L(ua2):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua2)			C loop while the subtraction did not borrow

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]		C top two words of this pass
	str	cyb, [rp, #8]
L(am2_0m4):
	sub	rp, rp, n, lsl #2	C step rp back by n limbs
	sub	up, up, n, lsl #2	C step up back by n limbs
	add	rp, rp, #8

	sub	i, n, #4
	sub	n, n, #2
	ldm	up, {v0,v1,u1}
	mov	cya, #0
	mov	cyb, #0
	ldr	r4, [rp, #4]
	umaal	r4, cya, v1, v0		C rp[1] += v1 * v0, carry into cya
	str	r4, [rp, #4]
	ldr	r4, [rp, #8]
	b	L(lo0)			C enter the loop at its midpoint

	ALIGN(16)
L(ua0):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
L(lo0):	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua0)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]		C top two words of this pass
	str	cyb, [rp, #8]
L(am2_2m4):
	sub	rp, rp, n, lsl #2	C step rp back by n limbs
	sub	up, up, n, lsl #2	C step up back by n limbs
	add	rp, rp, #16
	b	L(evnloop)


L(oddloop):
	sub	i, n, #5
	sub	n, n, #2
	ldm	up, {v0,v1,u0}
	mov	cya, #0
	mov	cyb, #0
	ldr	r5, [rp, #0]
	umaal	r5, cya, v1, v0		C rp[0] += v1 * v0, carry into cya
	str	r5, [rp, #0]
	ldr	r5, [rp, #4]
	add	up, up, #4
	b	L(lo1)			C enter the loop a quarter of the way in

	ALIGN(16)
L(ua1):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
L(lo1):	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua1)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]		C top two words of this pass
	str	cyb, [rp, #8]
L(am2_3m4):
	sub	rp, rp, n, lsl #2	C step rp back by n limbs
	sub	up, up, n, lsl #2	C step up back by n limbs
	add	rp, rp, #4

	subs	i, n, #3		C i = n - 3
	beq	L(cor3)			C n = 3: finish with the corner code
	sub	n, n, #2
	ldm	up, {v0,v1,u0}
	mov	cya, #0
	mov	cyb, #0
	ldr	r5, [rp, #8]
	sub	up, up, #4
	umaal	r5, cya, v1, v0		C rp[2] += v1 * v0, carry into cya
	str	r5, [rp, #8]
	ldr	r5, [rp, #12]
	b	L(lo3)			C enter the loop three quarters of the way in

	ALIGN(16)
L(ua3):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
L(lo3):	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua3)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]		C top two words of this pass
	str	cyb, [rp, #8]
L(am2_1m4):
	sub	rp, rp, n, lsl #2	C step rp back by n limbs
	sub	up, up, n, lsl #2	C step up back by n limbs
	add	rp, rp, #12
	b	L(oddloop)


C Corner code reached from L(am2_3m4) when n = 3.  Adds the products of the
C four limbs at up[0..3] that are still missing: an addmul_2 step with
C v0, v1 over u0, u1, followed by the single product u1 * u0.  The last two
C words are left in cya, cyb and are stored at L(sqr_diag_addlsh1).
L(cor3):ldm	up, {v0,v1,u0}
	ldr	r5, [rp, #8]
	mov	cya, #0
	mov	cyb, #0
	umaal	r5, cya, v1, v0
	str	r5, [rp, #8]
	ldr	r5, [rp, #12]
	ldr	r4, [rp, #16]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #12]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #16]
	str	cya, [rp, #20]
	str	cyb, [rp, #24]
	add	up, up, #16
	mov	cya, cyb
	adds	rp, rp, #36		C clear cy
	mov	cyb, #0
	umaal	cya, cyb, u1, u0	C cyb:cya = u1 * u0 + previous top word
	b	L(sqr_diag_addlsh1)

C Corner code reached from L(evnloop) when n < 6.  It starts from the values
C left in cya, cyb by the previous pass rather than loading them from rp, and
C adds the three products of the limbs loaded here.  The subs in L(evnloop)
C borrowed on this path and nothing executed since then updates the flags,
C so the carry flag is clear on arrival at L(sqr_diag_addlsh1).
L(cor2):
	ldm	up!, {v0,v1,u0}		C load three limbs, up += 12
	mov	r4, cya
	mov	r5, cyb
	mov	cya, #0
	umaal	r4, cya, v1, v0
	mov	cyb, #0
	umaal	r5, cya, u0, v0
	strd	r4, r5, [rp, #-4]	C store r4 at rp[-1] and r5 at rp[0]
	umaal	cya, cyb, u0, v1
	add	rp, rp, #16
C	b	L(sqr_diag_addlsh1)


C Register names for the final pass.  These reuse registers of the earlier
C defines: w0 = r6 (v1), w1 = r7 (u0), w2 = r8 (i), rbx = r9 (u1).
define(`w0', r6)
define(`w1', r7)
define(`w2', r8)
define(`rbx', r9)

C Final pass: double the off-diagonal sum held at rp and add the squares of
C the up limbs.  Each iteration handles one up limb and two rp words.  The
C carry flag links the additions at L(tsd) to the doubling at L(lm); the
C carry out of the doubling is saved in w2 because the subs on the loop
C counter overwrites the flag.
L(sqr_diag_addlsh1):
	str	cya, [rp, #-12]		C store the two words left by the corner code
	str	cyb, [rp, #-8]
	sub	n, n_saved, #1		C loop count = original n - 1
	sub	up, up, n_saved, lsl #2	C step up back by the original n limbs
	sub	rp, rp, n_saved, lsl #3	C step rp back by twice the original n limbs
	ldr	r3, [up], #4		C r3 = next up limb, up advances
	umull	w1, r5, r3, r3		C r5:w1 = square of that limb
	mov	w2, #0			C no saved carry yet
	mov	r10, #0			C constant zero for the adc below
C	cmn	r0, #0			C clear cy (already clear)
	b	L(lm)

L(tsd):	adds	w0, w0, rbx		C add high word of the previous square (+ saved carry)
	adcs	w1, w1, r4		C add low word of the current square
	str	w0, [rp, #0]
L(lm):	ldr	w0, [rp, #4]		C next word to double
	str	w1, [rp, #4]		C store the finished word in its place
	ldr	w1, [rp, #8]!		C following word to double; rp += 8
	add	rbx, r5, w2		C rbx = high word of last square + saved carry
	adcs	w0, w0, w0		C w0 = 2 * w0 + carry in
	ldr	r3, [up], #4
	adcs	w1, w1, w1		C w1 = 2 * w1 + carry
	adc	w2, r10, r10		C w2 = carry out of the doubling (r10 is 0)
	umull	r4, r5, r3, r3		C r5:r4 = square of the next up limb
	subs	n, n, #1
	bne	L(tsd)

	adds	w0, w0, rbx
	adcs	w1, w1, r4
	adc	w2, r5, w2		C top word: high word of last square + carries
	stm	rp, {w0,w1,w2}

	pop	{r4-r11, pc}		C restore registers; saved lr goes to pc


C Straight line code for n <= 4
C
C In these paths r1 and r2 (up and n) are reused as scratch once the operand
C limbs have been loaded.

C n = 1: rp[0..1] = up[0]^2.
L(1):	ldr	r3, [up, #0]
	umull	r1, r2, r3, r3
	stm	rp, {r1,r2}
	bx	r14

C n = 2: diagonal squares in r1..r4, cross product doubled and added at
C word 1.
L(2):	push	{r4-r5}
	ldm	up, {r5,r12}		C r5 = up[0], r12 = up[1]
	umull	r1, r2, r5, r5		C r2:r1 = up[0]^2
	umull	r3, r4, r12, r12	C r4:r3 = up[1]^2
	umull	r5, r12, r5, r12	C r12:r5 = up[0] * up[1]
	adds	r5, r5, r5		C double the cross product
	adcs	r12, r12, r12
	adc	r4, r4, #0		C carry out of the doubling goes to the top word
	adds	r2, r2, r5		C add doubled cross product to words 1..2
	adcs	r3, r3, r12
	adc	r4, r4, #0
	stm	rp, {r1,r2,r3,r4}
	pop	{r4-r5}
	bx	r14

C n = 3: diagonal squares in r1..r6, cross products accumulated in
C r10, r11, r12, r7, then doubled and added starting at word 1.
L(3):	push	{r4-r11}
	ldm	up, {r7,r8,r9}		C r7 = up[0], r8 = up[1], r9 = up[2]
	umull	r1, r2, r7, r7
	umull	r3, r4, r8, r8
	umull	r5, r6, r9, r9
	umull	r10, r11, r7, r8	C r11:r10 = up[0] * up[1]
	mov	r12, #0
	umlal	r11, r12, r7, r9	C r12:r11 += up[0] * up[2]
	mov	r7, #0
	umlal	r12, r7, r8, r9		C r7:r12 += up[1] * up[2]
	adds	r10, r10, r10		C double the cross products
	adcs	r11, r11, r11
	adcs	r12, r12, r12
	adcs	r7, r7, r7
	adc	r6, r6, #0		C carry out of the doubling goes to the top word
	adds	r2, r2, r10		C add doubled cross products to words 1..4
	adcs	r3, r3, r11
	adcs	r4, r4, r12
	adcs	r5, r5, r7
	adc	r6, r6, #0
	stm	rp, {r1,r2,r3,r4,r5,r6}
	pop	{r4-r11}
	bx	r14

C n = 4: the diagonal squares do not fit in registers together with the
C cross products, so words 0..6 of the squares are stored to rp first (the
C top word stays in r8), and words 1..6 are loaded back once the cross
C products have been formed and doubled.  r14 is used as scratch, hence the
C push of r14 and the return through pop to pc.
L(4):	push	{r4-r11, r14}
	ldm	up, {r9,r10,r11,r12}	C r9..r12 = up[0..3]
	umull	r1, r2, r9, r9
	umull	r3, r4, r10, r10
	umull	r5, r6, r11, r11
	umull	r7, r8, r12, r12
	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
	umull	r1, r2, r9, r10		C r2:r1 = up[0] * up[1]
	mov	r3, #0
	umlal	r2, r3, r9, r11		C r3:r2 += up[0] * up[2]
	mov	r4, #0
	umlal	r3, r4, r9, r12		C r4:r3 += up[0] * up[3]
	mov	r5, #0
	umlal	r3, r5, r10, r11	C r5:r3 += up[1] * up[2]
	umaal	r4, r5, r10, r12	C r5:r4 = up[1] * up[3] + r4 + r5
	mov	r6, #0
	umlal	r5, r6, r11, r12	C r6:r5 += up[2] * up[3]
	adds	r1, r1, r1		C double the cross products
	adcs	r2, r2, r2
	adcs	r3, r3, r3
	adcs	r4, r4, r4
	adcs	r5, r5, r5
	adcs	r6, r6, r6
	add	rp, rp, #4		C point at word 1 (does not update flags)
	adc	r7, r8, #0		C r7 = top word of squares + doubling carry
	ldm	rp, {r8,r9,r10,r11,r12,r14}	C reload words 1..6 of the squares
	adds	r1, r1, r8
	adcs	r2, r2, r9
	adcs	r3, r3, r10
	adcs	r4, r4, r11
	adcs	r5, r5, r12
	adcs	r6, r6, r14
	adc	r7, r7, #0
	stm	rp, {r1,r2,r3,r4,r5,r6,r7}	C words 1..7 of the result
	pop	{r4-r11, pc}
EPILOGUE()