github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/mul_1.asm (about) 1 dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the 2 dnl result in a second limb vector. 3 4 dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc. 5 6 dnl This file is part of the GNU MP Library. 7 dnl 8 dnl The GNU MP Library is free software; you can redistribute it and/or modify 9 dnl it under the terms of either: 10 dnl 11 dnl * the GNU Lesser General Public License as published by the Free 12 dnl Software Foundation; either version 3 of the License, or (at your 13 dnl option) any later version. 14 dnl 15 dnl or 16 dnl 17 dnl * the GNU General Public License as published by the Free Software 18 dnl Foundation; either version 2 of the License, or (at your option) any 19 dnl later version. 20 dnl 21 dnl or both in parallel, as here. 22 dnl 23 dnl The GNU MP Library is distributed in the hope that it will be useful, but 24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 dnl for more details. 27 dnl 28 dnl You should have received copies of the GNU General Public License and the 29 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30 dnl see https://www.gnu.org/licenses/. 31 32 include(`../config.m4') 33 34 C INPUT PARAMETERS 35 C res_ptr r16 36 C s1_ptr r17 37 C size r18 38 C s2_limb r19 39 40 C This code runs at 2.25 cycles/limb on EV6. 41 42 C This code was written in close cooperation with ev6 pipeline expert 43 C Steve Root. Any errors are tege's fault, though. 
C Code structure:

C  code for n < 8
C  code for n >= 8    code for (n mod 8)
C                     code for (n div 8)    feed-in code
C                                           8-way unrolled loop
C                                           wind-down code

C Some notes about unrolled loop:
C
C   r1-r8     multiplies and workup
C   r21-r28   multiplies and workup
C   r9-r12    loads
C   r0       -1
C   r20,r29,r13-r15  scramble
C
C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
C   put-the-carry-into-hi.  The idea is that these branches are very rarely
C   taken, and since a non-taken branch consumes no resources, that is better
C   than an addq.
C
C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
C   add NEXT cycle #09 which feeds a store in NEXT cycle #02

C The code could use some further work:
C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
C      faster than this for size < 3.
C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
C      that is too costly.
C   3. Consider using 4-way unrolling, even if that runs slower.
C   4. Reduce register usage.  In particular, try to avoid using r29.
ASM_START()

C mpn_mul_1 -- multiply a limb vector by a single limb.
C
C In:    r16 = res_ptr, r17 = s1_ptr, r18 = size (in limbs), r19 = s2_limb
C Out:   one result limb is stored at res_ptr for each source limb; the
C        remaining most significant (carry) limb is returned in r0.
C Notes: size == 0 is not handled: the first limb is loaded and the count
C        decremented before the count is first tested for zero.
C        The size < 8 path uses only r0-r4 besides the argument registers.
C        The size >= 8 path additionally overwrites r5-r15 and r20-r29; of
C        these r9-r15, r26 and r29 are saved in a stack frame on entry to
C        that path and restored before returning.
PROLOGUE(mpn_mul_1)
	cmpult	r18, 8, r1		C r1 = (size < 8), unsigned compare
	beq	r1, $Large		C r1 == 0 means size >= 8

C ---------------------------------------------------------------
C size < 8: one limb per iteration.
C r4 + r0 is the carry limb added to the next low product.  On first entry
C to $Lopa/$Le2a r0 holds the previous high product and r4 is zero;
C afterwards r4 holds the previous high product and r0 the carry bit of
C the previous add.
$Lsmall:
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r18,-1(r18)		C size--
	mulq	r2,r19,r3		C r3 = prod_low
	bic	r31,r31,r4		C clear cy_limb
	umulh	r2,r19,r0		C r0 = prod_high
	beq	r18,$Le1a		C jump if size was == 1
	ldq	r2,8(r17)		C r2 = s1_limb
	lda	r18,-1(r18)		C size--
	stq	r3,0(r16)
	beq	r18,$Le2a		C jump if size was == 2
	ALIGN(8)
C The load and the store below use offsets 16 and 8 because r17 and r16
C are only advanced later in the same iteration.
$Lopa:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r4,r0,r0		C cy_limb = cy_limb + 'cy'
	lda	r18,-1(r18)		C size--
	umulh	r2,r19,r4		C r4 = cy_limb
	ldq	r2,16(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	stq	r3,8(r16)
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	lda	r16,8(r16)		C res_ptr++
	bne	r18,$Lopa

C Last limb of the small case: no further load, final carry limb goes to r0.
$Le2a:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r4,r0,r0		C cy_limb = cy_limb + 'cy'
	umulh	r2,r19,r4		C r4 = cy_limb
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	stq	r3,8(r16)
	addq	r4,r0,r0		C cy_limb = prod_high + cy
	ret	r31,(r26),1
C size == 1: r0 already holds prod_high, which is the return value.
$Le1a:	stq	r3,0(r16)
	ret	r31,(r26),1

C ---------------------------------------------------------------
C size >= 8.
C Allocate a 224-byte frame and save the registers that must be restored
C before returning.  Only offsets 0..64 of the frame are written.
$Large:
	lda	r30, -224(r30)
	stq	r26, 0(r30)		C return address; r26 is reused below
	stq	r9, 8(r30)
	stq	r10, 16(r30)
	stq	r11, 24(r30)
	stq	r12, 32(r30)
	stq	r13, 40(r30)
	stq	r14, 48(r30)
	stq	r15, 56(r30)
	stq	r29, 64(r30)

	and	r18, 7, r20		C count for the first loop, 0-7
	srl	r18, 3, r18		C count for unrolled loop
	bis	r31, r31, r21		C r21 = 0, carry limb into unrolled code
	beq	r20, $L_8_or_more	C skip first loop

C First loop: handle (size mod 8) limbs one at a time, same scheme as the
C small case but counting in r20 and leaving the carry limb in r21 for
C the unrolled code.
$L_9_or_more:
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	lda	r20,-1(r20)		C size--
	mulq	r2,r19,r3		C r3 = prod_low
	umulh	r2,r19,r21		C r21 = prod_high
	beq	r20,$Le1b		C jump if size was == 1
C r0 is added to r21 as the carry bit at $Lopb/$Le2b, so it is zeroed here
C for their first use.
	bis	r31, r31, r0		C FIXME: shouldn't need this
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	lda	r20,-1(r20)		C size--
	stq	r3,0(r16)
	lda	r16,8(r16)		C res_ptr++
	beq	r20,$Le2b		C jump if size was == 2
	ALIGN(8)
$Lopb:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r21,r0,r0		C cy_limb = cy_limb + 'cy'
	lda	r20,-1(r20)		C size--
	umulh	r2,r19,r21		C r21 = prod_high
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	stq	r3,0(r16)
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	lda	r16,8(r16)		C res_ptr++
	bne	r20,$Lopb

$Le2b:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r21,r0,r0		C cy_limb = cy_limb + 'cy'
	umulh	r2,r19,r21		C r21 = prod_high
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	stq	r3,0(r16)
	lda	r16,8(r16)		C res_ptr++
	addq	r21,r0,r21		C cy_limb = prod_high + cy
	br	r31, $L_8_or_more
$Le1b:	stq	r3,0(r16)
	lda	r16,8(r16)		C res_ptr++

C ---------------------------------------------------------------
C Feed-in for the 8-way unrolled code.  r21 is the incoming carry limb.
C r17 is biased by -32 so that the first loads use the same offsets
C (32..64) as the loads at the top of the loop body.
C With r0 = -1, the umulh r0, r18, r18 at $Loop yields r18 - 1 for any
C r18 >= 1, i.e. it decrements the loop counter using the multiplier.
$L_8_or_more:
	lda	r0, -1(r31)		C put -1 in r0, for tricky loop control
	lda	r17, -32(r17)		C L1 bookkeeping
	lda	r18, -1(r18)		C decrement count

	ldq	r9, 32(r17)		C L1
	ldq	r10, 40(r17)		C L1
	mulq	r9, r19, r22		C U1 #07
	ldq	r11, 48(r17)		C L1
	umulh	r9, r19, r23		C U1 #08
	ldq	r12, 56(r17)		C L1
	mulq	r10, r19, r24		C U1 #09
	ldq	r9, 64(r17)		C L1

	lda	r17, 64(r17)		C L1 bookkeeping

	umulh	r10, r19, r25		C U1 #11
	mulq	r11, r19, r26		C U1 #12
	umulh	r11, r19, r27		C U1 #13
	mulq	r12, r19, r28		C U1 #14
	ldq	r10, 8(r17)		C L1
	umulh	r12, r19, r1		C U1 #15
	ldq	r11, 16(r17)		C L1
	mulq	r9, r19, r2		C U1 #16
	ldq	r12, 24(r17)		C L1
	umulh	r9, r19, r3		C U1 #17
	addq	r21, r22, r13		C L1 mov
	mulq	r10, r19, r4		C U1 #18
	addq	r23, r24, r22		C L0 sum 2 mul's
	cmpult	r13, r21, r14		C L1 carry from sum
	bgt	r18, $L_16_or_more	C more than one 8-limb group to do

C Exactly one 8-limb group: finish its multiplies without further loads
C and join the wind-down code.  r13 gets r31 (zero) added, so the zero
C test in front of $ret0c is skipped.
	cmpult	r22, r24, r24		C U0 carry from sum
	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	umulh	r11, r19, r7		C U1 #04
	addq	r27, r28, r28		C U0 sum 2 mul's
	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 32(r16)		C L1 bookkeeping
	addq	r13, r31, r13		C U0 start carry cascade
	umulh	r12, r19, r21		C U1 #06
	br	r31, $ret0c

C Two or more 8-limb groups: wind-up code.  Same schedule as the loop body
C but without the stores of a previous group, and advancing r16 by 32
C instead of 64.
$L_16_or_more:
C ---------------------------------------------------------------
	subq	r18,1,r18
	cmpult	r22, r24, r24		C U0 carry from sum
	ldq	r9, 32(r17)		C L1

	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	umulh	r11, r19, r7		C U1 #04
	addq	r27, r28, r28		C U0 sum 2 mul's
	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 32(r16)		C L1 bookkeeping
	addq	r13, r31, r13		C U0 start carry cascade

	umulh	r12, r19, r21		C U1 #06
C The add above uses r31 (zero), so the fix-up test stays commented out.
C	beq	r13, $fix0w		C U0
C Carry propagation idiom used from here on: after sum = x + cy_in, where
C cy_in is 0 or 1, sum == 0 is the only result for which the add can have
C wrapped.  The beq on the sum goes to a stub at the end of the function
C that ORs cy_in into the carry of the next stage and branches back.  The
C stub also runs when sum == 0 with cy_in == 0, where the OR changes nothing.
$ret0w:	addq	r22, r14, r26		C L0
	ldq	r10, 40(r17)		C L1

	mulq	r9, r19, r22		C U1 #07
	beq	r26, $fix1w		C U0
$ret1w:	addq	r23, r24, r27		C L0
	ldq	r11, 48(r17)		C L1

	umulh	r9, r19, r23		C U1 #08
	beq	r27, $fix2w		C U0
$ret2w:	addq	r28, r25, r28		C L0
	ldq	r12, 56(r17)		C L1

	mulq	r10, r19, r24		C U1 #09
	beq	r28, $fix3w		C U0
$ret3w:	addq	r1, r2, r20		C L0 sum 2 mul's
	ldq	r9, 64(r17)		C L1

	addq	r3, r4, r2		C L0 #10 2 mul's
	lda	r17, 64(r17)		C L1 bookkeeping
	cmpult	r20, r1, r29		C U0 carry from sum

	umulh	r10, r19, r25		C U1 #11
	cmpult	r2, r4, r4		C U0 carry from sum
	stq	r13, -32(r16)		C L0
	stq	r26, -24(r16)		C L1

	mulq	r11, r19, r26		C U1 #12
	addq	r5, r6, r14		C U0 sum 2 mul's
	stq	r27, -16(r16)		C L0
	stq	r28, -8(r16)		C L1

	umulh	r11, r19, r27		C U1 #13
	cmpult	r14, r6, r3		C U0 carry from sum
C could do cross-jumping here:
C	bra	$L_middle_of_unrolled_loop
	mulq	r12, r19, r28		C U1 #14
	addq	r7, r3, r5		C L0 eat carry
	addq	r20, r15, r20		C U0 carry cascade
	ldq	r10, 8(r17)		C L1

C The three fix-up branches below use the loop's stubs, which return to
C $ret4/$ret5/$ret6 inside the loop body rather than to $ret4w/$ret5w/$ret6w.
C The instructions from $ret4 to the end of the loop body are the same as
C those from $ret4w to the ble below, and bgt r18, $Loop falling through to
C $Lend selects the same successor as ble r18, $Lend falling into $Loop.
	umulh	r12, r19, r1		C U1 #15
	beq	r20, $fix4		C U0
$ret4w:	addq	r2, r29, r6		C L0
	ldq	r11, 16(r17)		C L1

	mulq	r9, r19, r2		C U1 #16
	beq	r6, $fix5		C U0
$ret5w:	addq	r14, r4, r7		C L0
	ldq	r12, 24(r17)		C L1

	umulh	r9, r19, r3		C U1 #17
	beq	r7, $fix6		C U0
$ret6w:	addq	r5, r8, r8		C L0 sum 2
	addq	r21, r22, r13		C L1 sum 2 mul's

	mulq	r10, r19, r4		C U1 #18
	addq	r23, r24, r22		C L0 sum 2 mul's
	cmpult	r13, r21, r14		C L1 carry from sum
	ble	r18, $Lend		C U0
C ---------------------------------------------------------------
C Main loop.  Each iteration loads eight source limbs (32..64(r17), then
C 8..24(r17) after r17 advances by 64) and stores eight result limbs
C (0..24(r16), then -32..-8(r16) after r16 advances by 64).
C The bis r31, r31, r31 instructions write the zero register and so have
C no effect on state; going by their comments they are issue-slot padding.
	ALIGN(16)
$Loop:
	umulh	r0, r18, r18		C U1 #01 decrement r18!
	cmpult	r8, r5, r29		C L0 carry from last bunch
	cmpult	r22, r24, r24		C U0 carry from sum
	ldq	r9, 32(r17)		C L1

	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	stq	r20, 0(r16)		C L0
	stq	r6, 8(r16)		C L1

	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	stq	r7, 16(r16)		C L0
	stq	r8, 24(r16)		C L1

	umulh	r11, r19, r7		C U1 #04
	bis	r31, r31, r31		C L0 st slosh
	bis	r31, r31, r31		C L1 st slosh
	addq	r27, r28, r28		C U0 sum 2 mul's

	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 64(r16)		C L1 bookkeeping
	addq	r13, r29, r13		C U0 start carry cascade

	umulh	r12, r19, r21		C U1 #06
	beq	r13, $fix0		C U0
$ret0:	addq	r22, r14, r26		C L0
	ldq	r10, 40(r17)		C L1

	mulq	r9, r19, r22		C U1 #07
	beq	r26, $fix1		C U0
$ret1:	addq	r23, r24, r27		C L0
	ldq	r11, 48(r17)		C L1

	umulh	r9, r19, r23		C U1 #08
	beq	r27, $fix2		C U0
$ret2:	addq	r28, r25, r28		C L0
	ldq	r12, 56(r17)		C L1

	mulq	r10, r19, r24		C U1 #09
	beq	r28, $fix3		C U0
$ret3:	addq	r1, r2, r20		C L0 sum 2 mul's
	ldq	r9, 64(r17)		C L1

	addq	r3, r4, r2		C L0 #10 2 mul's
	bis	r31, r31, r31		C U1 mul hole
	lda	r17, 64(r17)		C L1 bookkeeping
	cmpult	r20, r1, r29		C U0 carry from sum

	umulh	r10, r19, r25		C U1 #11
	cmpult	r2, r4, r4		C U0 carry from sum
	stq	r13, -32(r16)		C L0
	stq	r26, -24(r16)		C L1

	mulq	r11, r19, r26		C U1 #12
	addq	r5, r6, r14		C U0 sum 2 mul's
	stq	r27, -16(r16)		C L0
	stq	r28, -8(r16)		C L1

	umulh	r11, r19, r27		C U1 #13
	bis	r31, r31, r31		C L0 st slosh
	bis	r31, r31, r31		C L1 st slosh
	cmpult	r14, r6, r3		C U0 carry from sum
$L_middle_of_unrolled_loop:
	mulq	r12, r19, r28		C U1 #14
	addq	r7, r3, r5		C L0 eat carry
	addq	r20, r15, r20		C U0 carry cascade
	ldq	r10, 8(r17)		C L1

	umulh	r12, r19, r1		C U1 #15
	beq	r20, $fix4		C U0
$ret4:	addq	r2, r29, r6		C L0
	ldq	r11, 16(r17)		C L1

	mulq	r9, r19, r2		C U1 #16
	beq	r6, $fix5		C U0
$ret5:	addq	r14, r4, r7		C L0
	ldq	r12, 24(r17)		C L1

	umulh	r9, r19, r3		C U1 #17
	beq	r7, $fix6		C U0
$ret6:	addq	r5, r8, r8		C L0 sum 2
	addq	r21, r22, r13		C L1 sum 2 mul's

	mulq	r10, r19, r4		C U1 #18
	addq	r23, r24, r22		C L0 sum 2 mul's
	cmpult	r13, r21, r14		C L1 carry from sum
	bgt	r18, $Loop		C U0
C ---------------------------------------------------------------
C Wind-down: no further loads.  Store the four limbs pending from the loop,
C finish the multiplies already fed, and run the carry chain for the last
C eight limbs.
$Lend:
	cmpult	r8, r5, r29		C L0 carry from last bunch
	cmpult	r22, r24, r24		C U0 carry from sum

	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	stq	r20, 0(r16)		C L0
	stq	r6, 8(r16)		C L1

	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	stq	r7, 16(r16)		C L0
	stq	r8, 24(r16)		C L1

	umulh	r11, r19, r7		C U1 #04
	addq	r27, r28, r28		C U0 sum 2 mul's

	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 64(r16)		C L1 bookkeeping
	addq	r13, r29, r13		C U0 start carry cascade

	umulh	r12, r19, r21		C U1 #06
	beq	r13, $fix0c		C U0
C $ret0c is also the entry point from the exactly-one-group path above.
$ret0c:	addq	r22, r14, r26		C L0
	beq	r26, $fix1c		C U0
$ret1c:	addq	r23, r24, r27		C L0
	beq	r27, $fix2c		C U0
$ret2c:	addq	r28, r25, r28		C L0
	beq	r28, $fix3c		C U0
$ret3c:	addq	r1, r2, r20		C L0 sum 2 mul's
	addq	r3, r4, r2		C L0 #10 2 mul's
	lda	r17, 64(r17)		C L1 bookkeeping
	cmpult	r20, r1, r29		C U0 carry from sum
	cmpult	r2, r4, r4		C U0 carry from sum
	stq	r13, -32(r16)		C L0
	stq	r26, -24(r16)		C L1
	addq	r5, r6, r14		C U0 sum 2 mul's
	stq	r27, -16(r16)		C L0
	stq	r28, -8(r16)		C L1
	cmpult	r14, r6, r3		C U0 carry from sum
	addq	r7, r3, r5		C L0 eat carry
	addq	r20, r15, r20		C U0 carry cascade
	beq	r20, $fix4c		C U0
$ret4c:	addq	r2, r29, r6		C L0
	beq	r6, $fix5c		C U0
$ret5c:	addq	r14, r4, r7		C L0
	beq	r7, $fix6c		C U0
$ret6c:	addq	r5, r8, r8		C L0 sum 2
	cmpult	r8, r5, r29		C L0 carry from last bunch
	stq	r20, 0(r16)		C L0
	stq	r6, 8(r16)		C L1
	stq	r7, 16(r16)		C L0
	stq	r8, 24(r16)		C L1
	addq	r29, r21, r0		C return value: last high product + carry

C Restore the saved registers, release the frame and return.
	ldq	r26, 0(r30)
	ldq	r9, 8(r30)
	ldq	r10, 16(r30)
	ldq	r11, 24(r30)
	ldq	r12, 32(r30)
	ldq	r13, 40(r30)
	ldq	r14, 48(r30)
	ldq	r15, 56(r30)
	ldq	r29, 64(r30)
	lda	r30, 224(r30)
	ret	r31, (r26), 1

C ---------------------------------------------------------------
C Carry fix-up stubs, placed out of line after the return.  Each is
C reached from a beq on a zero sum, merges the incoming carry into the
C carry of the next stage and branches back to the matching $ret label.
C Suffix w = wind-up code, no suffix = loop body, suffix c = wind-down.
C The $fix6 variants add the carry into r5 instead of ORing it.
C $fix0w:	bis	r14, r29, r14		C join carries
C	br	r31, $ret0w
$fix1w:	bis	r24, r14, r24		C join carries
	br	r31, $ret1w
$fix2w:	bis	r25, r24, r25		C join carries
	br	r31, $ret2w
$fix3w:	bis	r15, r25, r15		C join carries
	br	r31, $ret3w
$fix0:	bis	r14, r29, r14		C join carries
	br	r31, $ret0
$fix1:	bis	r24, r14, r24		C join carries
	br	r31, $ret1
$fix2:	bis	r25, r24, r25		C join carries
	br	r31, $ret2
$fix3:	bis	r15, r25, r15		C join carries
	br	r31, $ret3
$fix4:	bis	r29, r15, r29		C join carries
	br	r31, $ret4
$fix5:	bis	r4, r29, r4		C join carries
	br	r31, $ret5
$fix6:	addq	r5, r4, r5		C can't carry twice!
	br	r31, $ret6
$fix0c:	bis	r14, r29, r14		C join carries
	br	r31, $ret0c
$fix1c:	bis	r24, r14, r24		C join carries
	br	r31, $ret1c
$fix2c:	bis	r25, r24, r25		C join carries
	br	r31, $ret2c
$fix3c:	bis	r15, r25, r15		C join carries
	br	r31, $ret3c
$fix4c:	bis	r29, r15, r29		C join carries
	br	r31, $ret4c
$fix5c:	bis	r4, r29, r4		C join carries
	br	r31, $ret5c
$fix6c:	addq	r5, r4, r5		C can't carry twice!
	br	r31, $ret6c

EPILOGUE(mpn_mul_1)
ASM_END()