dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/addmul_1.asm (about)

dnl  PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl  result to a second limb vector.

dnl  Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb
C 603e:            ?
C 604e:            6.75
C 75x (G3):        8.7-14.3
C 7400,7410 (G4):  8.7-14.3
C 744x,745x (G4+): 9.5
C power4/ppc970:   6.25
C power5:          6.25

C INPUT PARAMETERS
C rp	r3
C up	r4
C n	r5
C vl	r6

C This is optimized for the PPC604.  It has not been tuned for other
C PowerPC processors.
C
C Loop Analysis for the 604:
C 12 mem insn
C 8 serializing insn
C 8 int multiply
C 25 int reg write
C 9 int ops (8 of which serialize)
C
C The multiply insns need 16 cycles/4limb.
C The integer register writes will need 13 cycles/4limb.
C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604,
C but that will require some clever FPNOPS and BNOPS for exact
C issue control.

C Summary of the code below (r3 = rp, r4 = up, r5 = n, r6 = vl):
C   - each limb loaded through r4 is multiplied by r6 with mullw (low word)
C     and mulhwu (high word); the low word, plus the high word of the
C     previous product, is added to the limb loaded through r3 and the sum
C     is stored back to the same rp position;
C   - r10 (short path) or r0 (long path) holds the high word that still has
C     to be added at the next limb position, and XER[CA] holds the pending
C     carry bit of the additions;
C   - on exit r3 = last high word + final carry bit (presumably the carry-out
C     limb handed back to the caller -- confirm against the C prototype).
C Vectors of at most 9 limbs use a one-limb-per-iteration loop; longer ones
C branch to the 4-way unrolled code, which saves r30/r31 in a 16-byte frame.
C The instruction order is load-bearing: every adde/addze consumes the CA
C left by one specific earlier add, with only loads, stores, multiplies and
C counter branches in between.  Do not reorder without re-checking the chain.
C NOTE(review): no n = 0 check is visible; the first limb is loaded and a
C result limb is stored unconditionally, so n >= 1 looks required -- confirm.

ASM_START()
PROLOGUE(mpn_addmul_1)
	cmpwi	cr0,r5,9	C more than 9 limbs?
	bgt	cr0,L(big)	C branch if more than 9 limbs

C ---- Short path: n <= 9, one limb per loop iteration ----
	mtctr	r5		C ctr = n
	lwz	r0,0(r4)	C r0 = up[0]
	mullw	r7,r0,r6	C r7 = low word of up[0] * vl
	mulhwu	r10,r0,r6	C r10 = high word of up[0] * vl
	lwz	r9,0(r3)	C r9 = rp[0]
	addc	r8,r7,r9	C r8 = low word + rp[0], CA = carry out
	addi	r3,r3,-4	C bias rp by one limb so stores can use 4(r3)
	bdz	L(end)		C decrement ctr; single limb: skip the loop
L(loop):
	lwzu	r0,4(r4)	C r0 = next up limb, r4 advances
	stwu	r8,4(r3)	C store previous result limb, r3 advances
	mullw	r8,r0,r6	C low word of new product
	adde	r7,r8,r10	C + previous high word + CA from the last addc
	mulhwu	r10,r0,r6	C high word of new product
	lwz	r9,4(r3)	C rp limb for this position
	addze	r10,r10		C fold CA from the adde into the high word
	addc	r8,r7,r9	C add the rp limb, CA = carry for next round
	bdnz	L(loop)
L(end):	stw	r8,4(r3)	C store the last result limb
	addze	r3,r10		C r3 = final high word + CA
	blr

C ---- Long path: n > 9, first limb alone, then 4 limbs per iteration ----
L(big):	stwu	r1,-16(r1)	C open a 16-byte stack frame
	addi	r5,r5,-1	C r5 = n - 1, limbs left after the first one
	stw	r30,8(r1)	C save r30, reloaded at L(endx)
	srwi	r0,r5,2		C r0 = (n - 1) / 4 unrolled iterations
	stw	r31,12(r1)	C save r31, reloaded at L(endx)
	mtctr	r0		C n > 9 on this path, so the count is >= 2

	lwz	r7,0(r4)	C r7 = up[0]
	mullw	r8,r7,r6	C low word of up[0] * vl
	mulhwu	r0,r7,r6	C r0 = cy_limb (high word)
	lwz	r7,0(r3)	C r7 = rp[0]
	addc	r8,r8,r7	C add rp[0], CA = carry out
	stw	r8,0(r3)	C rp[0] done; r3/r4 still point at limb 0

L(loopU):
	lwz	r7,4(r4)	C load the next four up limbs ...
	lwz	r12,8(r4)
	lwz	r30,12(r4)
	lwzu	r31,16(r4)	C ... and advance r4 by four limbs
	mullw	r8,r7,r6	C four low words
	mullw	r9,r12,r6
	mullw	r10,r30,r6
	mullw	r11,r31,r6
	adde	r8,r8,r0	C add cy_limb
	mulhwu	r0,r7,r6	C each high word is added into the next low word
	lwz	r7,4(r3)	C r7/r12/r30/r31 are reused for the rp limbs
	adde	r9,r9,r0
	mulhwu	r0,r12,r6
	lwz	r12,8(r3)
	adde	r10,r10,r0
	mulhwu	r0,r30,r6
	lwz	r30,12(r3)
	adde	r11,r11,r0
	mulhwu	r0,r31,r6
	lwz	r31,16(r3)
	addze	r0,r0		C new cy_limb
	addc	r8,r8,r7	C second chain: add the four rp limbs
	stw	r8,4(r3)
	adde	r9,r9,r12
	stw	r9,8(r3)
	adde	r10,r10,r30
	stw	r10,12(r3)
	adde	r11,r11,r31	C CA from here feeds the next adde/addze executed
	stwu	r11,16(r3)	C store and advance r3 by four limbs
	bdnz	L(loopU)

	andi.	r31,r5,3	C r31 = (n - 1) mod 4 leftover limbs, sets cr0
	mtctr	r31
	beq	cr0,L(endx)	C no leftovers

L(loopE):
	lwzu	r7,4(r4)	C next up limb, r4 advances
	mullw	r8,r7,r6
	adde	r8,r8,r0	C add cy_limb
	mulhwu	r0,r7,r6
	lwz	r7,4(r3)	C rp limb for this position
	addze	r0,r0		C new cy_limb
	addc	r8,r8,r7	C add the rp limb, CA = carry for next round
	stwu	r8,4(r3)	C store and advance r3
	bdnz	L(loopE)
L(endx):
	addze	r3,r0		C r3 = cy_limb + CA
	lwz	r30,8(r1)	C restore the registers saved at L(big)
	lwz	r31,12(r1)
	addi	r1,r1,16	C close the stack frame
	blr
EPILOGUE(mpn_addmul_1)