github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/sub_n.asm (about) 1 dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0 2 dnl and store difference in a third limb vector. 3 4 dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc. 5 6 dnl This file is part of the GNU MP Library. 7 dnl 8 dnl The GNU MP Library is free software; you can redistribute it and/or modify 9 dnl it under the terms of either: 10 dnl 11 dnl * the GNU Lesser General Public License as published by the Free 12 dnl Software Foundation; either version 3 of the License, or (at your 13 dnl option) any later version. 14 dnl 15 dnl or 16 dnl 17 dnl * the GNU General Public License as published by the Free Software 18 dnl Foundation; either version 2 of the License, or (at your option) any 19 dnl later version. 20 dnl 21 dnl or both in parallel, as here. 22 dnl 23 dnl The GNU MP Library is distributed in the hope that it will be useful, but 24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 dnl for more details. 27 dnl 28 dnl You should have received copies of the GNU General Public License and the 29 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30 dnl see https://www.gnu.org/licenses/. 31 32 include(`../config.m4') 33 34 C cycles/limb 35 C EV4: ? 36 C EV5: 5.4 37 C EV6: 2.125 38 39 C INPUT PARAMETERS 40 C rp r16 41 C up r17 42 C vp r18 43 C n r19 44 C cy r20 (for mpn_sub_nc) 45 46 C TODO 47 C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1) 48 C Use multi-pronged feed-in. 49 C Perform additional micro-tuning 50 51 C This code was written in cooperation with ev6 pipeline expert Steve Root. 
C Pair loads and stores where possible
C Store pairs oct-aligned where possible (didn't need it here)
C Stores are delayed every third cycle
C Loads and stores are delayed by fills
C U stays still, put code there where possible (note alternation of U1 and U0)
C L moves because of loads and stores
C Note dampers in L to limit damage

C This odd-looking optimization expects that we are having random bits in our
C data, so that a pure zero result is unlikely.  So we penalize the unlikely
C case to help the common case.
C
C How the borrow chain works:
C   For each limb the code computes d = u - v and b = (u < v) independently
C   of the incoming borrow, then stores d - borrow_in.  Subtracting the
C   incoming borrow can only wrap when d is exactly zero, and in that case
C   the borrow out of the limb must also include borrow_in.  The fast path
C   therefore just tests d for zero and, only when it is zero, branches to
C   an out-of-line $fixN stub that ORs borrow_in into b and branches back
C   to the matching $retN label.
C   NOTE(review): this relies on the incoming borrow being 0 or 1.
C
C The U0/U1/L0/L1 tags in the line comments name the ev6 issue slot each
C instruction was scheduled for; instruction order is part of the tuning.

C Limb registers.  u0/v0 and u1/v1 are two alternating (up, vp) limb pairs.
C Note that u0 is r0, which is also the register the borrow is returned in.
define(`u0', `r0') define(`u1', `r3')
define(`v0', `r1') define(`v1', `r4')

C Borrow registers.  cy0 is r20, the incoming-borrow argument of mpn_sub_nc.
C r22 and r23 are used as two further borrow registers (see the fix stubs).
define(`cy0', `r20') define(`cy1', `r21')

MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)

ASM_START()

C mpn_sub_nc: same as mpn_sub_n but takes the incoming borrow in r20 (cy0).
C It jumps past the instruction that clears cy0 and shares the rest of the
C code with mpn_sub_n.
PROLOGUE(mpn_sub_nc)
	br	r31, $entry
EPILOGUE()

C mpn_sub_n: rp[0..n-1] = up[0..n-1] - vp[0..n-1], borrow out returned in r0.
C   r16 = rp, r17 = up, r18 = vp, r19 = n (n > 0, see the file header).
C Structure:
C   n < 5   -> $Lsmall, a simple one-limb-per-iteration loop
C   n >= 5  -> feed-in (starts 5 limbs), optional 8-way unrolled $Loop,
C              wind-down at $Lend (finishes 5 limbs), then $Lsmall for
C              whatever is left, or $Lret if nothing is left.
PROLOGUE(mpn_sub_n)
	bis	r31, r31, cy0		C clear carry in
$entry:	cmpult	r19, 5, r22		C L1 r22 = (n < 5)
	ldq	u1, 0(r17)		C L0 get next ones
	ldq	v1, 0(r18)		C L1
	bne	r22, $Lsmall		C fewer than 5 limbs: simple loop

C Feed-in: start limbs 0..4 so the software pipeline is full.
	ldq	u0, 8(r17)		C L0 get next ones
	ldq	v0, 8(r18)		C L1
	subq	u1, v1, r5		C U0 sub two data

	cmpult	u1, v1, r23		C U0 did it borrow
	ldq	u1, 16(r17)		C L0 get next ones
	ldq	v1, 16(r18)		C L1

	subq	u0, v0, r8		C U1 sub two data
	subq	r5, cy0, r24		C U0 borrow in

	cmpult	u0, v0, r22		C U1 did it borrow
	beq	r5, $fix5f		C U0 fix exact zero
$ret5f:	ldq	u0, 24(r17)		C L0 get next ones
	ldq	v0, 24(r18)		C L1

	subq	r8, r23, r25		C U1 borrow from last
	subq	u1, v1, r7		C U0 sub two data

	beq	r8, $fix6f		C U1 fix exact zero
$ret6f:	cmpult	u1, v1, r23		C U0 did it borrow
	ldq	u1, 32(r17)		C L0 get next ones
	ldq	v1, 32(r18)		C L1

	lda	r17, 40(r17)		C L0 move pointer
	lda	r18, 40(r18)		C L1 move pointer

	lda	r16, -8(r16)		C bias rp: pending stores use offsets 8, 16, ...
	lda	r19, -13(r19)		C L1 move counter (5 limbs in flight + 8 per loop pass)
	blt	r19, $Lend		C U1 loop control


C Main loop.  8-way unrolled.
C On entry r24/r25 hold two results not yet stored, r7 holds a difference
C that still needs its borrow applied, and u0/v0, u1/v1 hold loaded limbs.
C The bis r31,r31,r31 instructions are no-ops placed for scheduling.
	ALIGN(16)
$Loop:	subq	u0, v0, r2		C U1 sub two data
	stq	r24, 8(r16)		C L0 put an answer
	subq	r7, r22, r24		C U0 borrow from last
	stq	r25, 16(r16)		C L1 pair

	cmpult	u0, v0, cy1		C U1 did it borrow
	beq	r7, $fix7		C U0 fix exact 0
$ret7:	ldq	u0, 0(r17)		C L0 get next ones
	ldq	v0, 0(r18)		C L1

	bis	r31, r31, r31		C L damp out
	subq	r2, r23, r25		C U1 borrow from last
	bis	r31, r31, r31		C L moves in L !
	subq	u1, v1, r5		C U0 sub two data

	beq	r2, $fix0		C U1 fix exact zero
$ret0:	cmpult	u1, v1, cy0		C U0 did it borrow
	ldq	u1, 8(r17)		C L0 get next ones
	ldq	v1, 8(r18)		C L1

	subq	u0, v0, r8		C U1 sub two data
	stq	r24, 24(r16)		C L0 store pair
	subq	r5, cy1, r24		C U0 borrow from last
	stq	r25, 32(r16)		C L1

	cmpult	u0, v0, r22		C U1 did it borrow
	beq	r5, $fix1		C U0 fix exact zero
$ret1:	ldq	u0, 16(r17)		C L0 get next ones
	ldq	v0, 16(r18)		C L1

	lda	r16, 64(r16)		C L0 move pointer (later stores use negative offsets)
	subq	r8, cy0, r25		C U1 borrow from last
	lda	r19, -8(r19)		C L1 move counter
	subq	u1, v1, r7		C U0 sub two data

	beq	r8, $fix2		C U1 fix exact zero
$ret2:	cmpult	u1, v1, r23		C U0 did it borrow
	ldq	u1, 24(r17)		C L0 get next ones
	ldq	v1, 24(r18)		C L1

	subq	u0, v0, r2		C U1 sub two data
	stq	r24, -24(r16)		C L0 put an answer
	subq	r7, r22, r24		C U0 borrow from last
	stq	r25, -16(r16)		C L1 pair

	cmpult	u0, v0, cy1		C U1 did it borrow
	beq	r7, $fix3		C U0 fix exact 0
$ret3:	ldq	u0, 32(r17)		C L0 get next ones
	ldq	v0, 32(r18)		C L1

	bis	r31, r31, r31		C L damp out
	subq	r2, r23, r25		C U1 borrow from last
	bis	r31, r31, r31		C L moves in L !
	subq	u1, v1, r5		C U0 sub two data

	beq	r2, $fix4		C U1 fix exact zero
$ret4:	cmpult	u1, v1, cy0		C U0 did it borrow
	ldq	u1, 40(r17)		C L0 get next ones
	ldq	v1, 40(r18)		C L1

	subq	u0, v0, r8		C U1 sub two data
	stq	r24, -8(r16)		C L0 store pair
	subq	r5, cy1, r24		C U0 borrow from last
	stq	r25, 0(r16)		C L1

	cmpult	u0, v0, r22		C U1 did it borrow
	beq	r5, $fix5		C U0 fix exact zero
$ret5:	ldq	u0, 48(r17)		C L0 get next ones
	ldq	v0, 48(r18)		C L1

	ldl	r31, 256(r17)		C L0 prefetch
	subq	r8, cy0, r25		C U1 borrow from last
	ldl	r31, 256(r18)		C L1 prefetch
	subq	u1, v1, r7		C U0 sub two data

	beq	r8, $fix6		C U1 fix exact zero
$ret6:	cmpult	u1, v1, r23		C U0 did it borrow
	ldq	u1, 56(r17)		C L0 get next ones
	ldq	v1, 56(r18)		C L1

	lda	r17, 64(r17)		C L0 move pointer
	bis	r31, r31, r31		C U
	lda	r18, 64(r18)		C L1 move pointer
	bge	r19, $Loop		C U1 loop control
C ==== main loop end

C Wind-down: finish the 5 limbs still in flight (two results in r24/r25,
C one difference in r7, and the two limb pairs already loaded).  No further
C loads are issued until it is known that more limbs remain.
$Lend:	subq	u0, v0, r2		C U1 sub two data
	stq	r24, 8(r16)		C L0 put an answer
	subq	r7, r22, r24		C U0 borrow from last
	stq	r25, 16(r16)		C L1 pair
	cmpult	u0, v0, cy1		C U1 did it borrow
	beq	r7, $fix7c		C U0 fix exact 0
$ret7c:	subq	r2, r23, r25		C U1 borrow from last
	subq	u1, v1, r5		C U0 sub two data
	beq	r2, $fix0c		C U1 fix exact zero
$ret0c:	cmpult	u1, v1, cy0		C U0 did it borrow
	stq	r24, 24(r16)		C L0 store pair
	subq	r5, cy1, r24		C U0 borrow from last
	stq	r25, 32(r16)		C L1
	beq	r5, $fix1c		C U0 fix exact zero
$ret1c:	stq	r24, 40(r16)		C L0 put an answer
	lda	r16, 48(r16)		C L0 move pointer (rp now points at the next limb to write)

	lda	r19, 8(r19)		C undo the loop bias: r19 = limbs still to do
	beq	r19, $Lret		C nothing left: return the borrow in cy0

	ldq	u1, 0(r17)		C preload first leftover limb pair for $Lsmall
	ldq	v1, 0(r18)

C Simple loop: one limb per iteration, borrow carried in cy0.
C Entered with u1/v1 already loaded and r19 = number of limbs to do (> 0).
C The last limb is peeled off into $Lend0 so that the loop body can always
C load the following limb pair.
$Lsmall:
	lda	r19, -1(r19)
	beq	r19, $Lend0		C exactly one limb left

	ALIGN(8)
$Loop0:	subq	u1, v1, r2		C main sub
	cmpult	u1, v1, r8		C compute bw from main sub
	ldq	u1, 8(r17)
	ldq	v1, 8(r18)
	subq	r2, cy0, r5		C borrow sub
	lda	r17, 8(r17)
	lda	r18, 8(r18)
	stq	r5, 0(r16)
	cmpult	r2, cy0, cy0		C compute bw from borrow sub
	lda	r19, -1(r19)		C decr loop cnt
	bis	r8, cy0, cy0		C combine bw from the two subs
	lda	r16, 8(r16)
	bne	r19, $Loop0
$Lend0:	subq	u1, v1, r2		C main sub
	subq	r2, cy0, r5		C borrow sub
	cmpult	u1, v1, r8		C compute bw from main sub
	cmpult	r2, cy0, cy0		C compute bw from borrow sub
	stq	r5, 0(r16)
	bis	r8, cy0, r0		C combine bw from the two subs into the return value
	ret	r31,(r26),1

	ALIGN(8)
$Lret:	lda	r0, 0(cy0)		C copy borrow into return register
	ret	r31,(r26),1

C Out-of-line fix-up stubs for the unlikely "difference is exactly zero"
C case.  Each one ORs the borrow coming into a limb into that limb's own
C borrow-out register and branches back to the matching $ret label.
C Suffix f = feed-in, suffix c = wind-down, no suffix = main loop.
$fix5f:	bis	r23, cy0, r23		C bring forward borrow
	br	r31, $ret5f
$fix6f:	bis	r22, r23, r22		C bring forward borrow
	br	r31, $ret6f
$fix0:	bis	cy1, r23, cy1		C bring forward borrow
	br	r31, $ret0
$fix1:	bis	cy0, cy1, cy0		C bring forward borrow
	br	r31, $ret1
$fix2:	bis	r22, cy0, r22		C bring forward borrow
	br	r31, $ret2
$fix3:	bis	r23, r22, r23		C bring forward borrow
	br	r31, $ret3
$fix4:	bis	cy1, r23, cy1		C bring forward borrow
	br	r31, $ret4
$fix5:	bis	cy1, cy0, cy0		C bring forward borrow
	br	r31, $ret5
$fix6:	bis	r22, cy0, r22		C bring forward borrow
	br	r31, $ret6
$fix7:	bis	r23, r22, r23		C bring forward borrow
	br	r31, $ret7
$fix0c:	bis	cy1, r23, cy1		C bring forward borrow
	br	r31, $ret0c
$fix1c:	bis	cy0, cy1, cy0		C bring forward borrow
	br	r31, $ret1c
$fix7c:	bis	r23, r22, r23		C bring forward borrow
	br	r31, $ret7c

EPILOGUE()
ASM_END()