dnl  IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2010, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C            norm    frac
C itanium 1
C itanium 2    29      29

C TODO
C  * Inline and interleave limb inversion code with loop setup code.
C  * We should use explicit bundling in much of the code, since it typically
C    cuts some cycles with the GNU assembler.
42 43 44 ASM_START() 45 46 C HP's assembler requires these declarations for importing mpn_invert_limb 47 .global mpn_invert_limb 48 .type mpn_invert_limb,@function 49 50 C INPUT PARAMETERS 51 C qp = r32 52 C fn = r33 53 C np = r34 54 C nn = r35 55 C dp = r36 56 57 define(`f0x1', `f15') 58 59 ASM_START() 60 PROLOGUE(mpn_divrem_2) 61 .prologue 62 ifdef(`HAVE_ABI_32', 63 ` addp4 r32 = 0, r32 C M I 64 addp4 r34 = 0, r34 C M I 65 zxt4 r35 = r35 C I 66 addp4 r36 = 0, r36 C M I 67 nop.m 0 68 zxt4 r33 = r33 C I 69 ;; 70 ') 71 .save ar.pfs, r42 72 alloc r42 = ar.pfs, 5, 9, 1, 0 73 shladd r34 = r35, 3, r34 74 adds r14 = 8, r36 75 mov r43 = r1 76 ;; 77 adds r15 = -8, r34 78 ld8 r39 = [r14] 79 .save ar.lc, r45 80 mov r45 = ar.lc 81 adds r14 = -16, r34 82 mov r40 = r0 83 adds r34 = -24, r34 84 ;; 85 ld8 r38 = [r15] 86 .save rp, r41 87 mov r41 = b0 88 .body 89 ld8 r36 = [r36] 90 ld8 r37 = [r14] 91 ;; 92 cmp.gtu p6, p7 = r39, r38 93 (p6) br.cond.dptk .L8 94 ;; 95 cmp.leu p8, p9 = r36, r37 96 cmp.geu p6, p7 = r39, r38 97 ;; 98 (p8) cmp4.ne.and.orcm p6, p7 = 0, r0 99 (p7) br.cond.dptk .L51 100 .L8: 101 add r14 = r33, r35 // un + fn 102 mov r46 = r39 // argument to mpn_invert_limb 103 ;; 104 adds r35 = -3, r14 105 ;; 106 cmp.gt p12, p0 = r0, r35 107 (p12) br.cond.dpnt L(end) 108 br.call.sptk.many b0 = mpn_invert_limb 109 ;; 110 setf.sig f11 = r8 // di (non-final) 111 setf.sig f34 = r39 // d1 112 setf.sig f33 = r36 // d0 113 mov r1 = r43 114 ;; 115 mov r17 = 1 116 setf.sig f9 = r38 // n2 117 xma.l f6 = f11, f34, f0 // t0 = LO(di * d1) 118 ;; 119 setf.sig f10 = r37 // n1 120 setf.sig f15 = r17 // 1 121 xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0) 122 ;; 123 getf.sig r17 = f6 124 getf.sig r16 = f8 125 mov ar.lc = r35 126 ;; 127 sub r18 = r0, r39 // -d1 128 add r14 = r17, r36 129 ;; 130 setf.sig f14 = r18 // -d1 131 cmp.leu p8, p9 = r17, r14 132 add r16 = r14, r16 133 ;; 134 (p9) adds r19 = 0, r0 135 (p8) adds r19 = -1, r0 136 cmp.gtu p6, p7 = r14, r16 137 ;; 138 (p6) adds r19 = 1, r19 
139 ;; 140 ifelse(1,1,` 141 cmp.gt p7, p6 = r0, r19 142 ;; 143 (p6) adds r8 = -1, r8 // di-- 144 (p6) sub r14 = r16, r39 // t0 -= d1 145 (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1 146 ;; 147 (p6) cmp.gt p9, p8 = 1, r19 148 (p7) cmp.gt p9, p8 = 0, r19 149 (p6) adds r19 = -1, r19 // t1 -= cy 150 mov r16 = r14 151 ;; 152 (p8) adds r8 = -1, r8 // di-- 153 (p8) sub r14 = r16, r39 // t0 -= d1 154 (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1 155 ;; 156 (p8) cmp.gt p7, p6 = 1, r19 157 (p9) cmp.gt p7, p6 = 0, r19 158 (p8) adds r19 = -1, r19 // t1 -= cy 159 mov r16 = r14 160 ;; 161 (p6) adds r8 = -1, r8 // di-- 162 (p6) sub r14 = r16, r39 // t0 -= d1 163 (p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1 164 ;; 165 (p6) cmp.gt p9, p8 = 1, r19 166 (p7) cmp.gt p9, p8 = 0, r19 167 (p6) adds r19 = -1, r19 // t1 -= cy 168 mov r16 = r14 169 ;; 170 (p8) adds r8 = -1, r8 // di-- 171 (p8) sub r14 = r16, r39 // t0 -= d1 172 (p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1 173 ;; 174 (p8) adds r19 = -1, r19 // t1 -= cy 175 mov r16 = r14 176 ',` 177 cmp.gt p8, p9 = r0, r19 178 (p8) br.cond.dpnt .L46 179 .L52: 180 cmp.leu p6, p7 = r39, r16 181 sub r14 = r16, r39 182 adds r8 = -1, r8 183 ;; 184 (p7) adds r19 = -1, r19 185 mov r16 = r14 186 ;; 187 (p7) cmp.gt p8, p9 = r0, r19 188 (p9) br.cond.dptk .L52 189 .L46: 190 ') 191 setf.sig f32 = r8 // di 192 shladd r32 = r35, 3, r32 193 ;; 194 195 ALIGN(16) 196 L(top): nop 0 197 nop 0 198 cmp.gt p8, p9 = r33, r35 199 ;; 200 (p8) mov r37 = r0 201 (p9) ld8 r37 = [r34], -8 202 xma.hu f8 = f9, f32, f10 // 0,29 203 xma.l f12 = f9, f32, f10 // 0 204 ;; 205 getf.sig r20 = f12 // q0 4 206 xma.l f13 = f15, f8, f9 // q += n2 4 207 sub r8 = -1, r36 // bitnot d0 208 ;; 209 getf.sig r18 = f13 // 8 210 xma.l f7 = f14, f13, f10 // 8 211 xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8 212 xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9 213 ;; 214 getf.sig r38 = f7 // n1 12 215 getf.sig r16 = f6 // 13 216 getf.sig r19 = f9 // 14 217 ;; 218 sub r38 = 
r38, r39 // n1 -= d1 17 219 ;; 220 cmp.ne p9, p0 = r0, r0 // clear p9 221 cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18 222 ;; 223 sub r37 = r37, r16 // n0 -= t0 19 224 (p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19 225 (p10) sub r38 = r38, r19 // n1 -= t1 19 226 ;; 227 cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20 228 ;; 229 (p7) cmp.ltu p9, p0 = r8, r37 // 21 230 (p6) add r18 = 1, r18 // 231 (p7) add r37 = r37, r36 // 21 232 (p7) add r38 = r38, r39 // 21 233 ;; 234 setf.sig f10 = r37 // n1 22 235 (p9) add r38 = 1, r38 // 22 236 ;; 237 setf.sig f9 = r38 // n2 23 238 cmp.gtu p6, p7 = r39, r38 // 23 239 (p7) br.cond.spnt L(fix) 240 L(bck): st8 [r32] = r18, -8 241 adds r35 = -1, r35 242 br.cloop.sptk.few L(top) 243 ;; 244 245 L(end): add r14 = 8, r34 246 add r15 = 16, r34 247 mov b0 = r41 248 ;; 249 st8 [r14] = r37 250 st8 [r15] = r38 251 mov ar.pfs = r42 252 mov r8 = r40 253 mov ar.lc = r45 254 br.ret.sptk.many b0 255 ;; 256 .L51: 257 .pred.rel "mutex", p8, p9 258 sub r37 = r37, r36 259 (p9) sub r38 = r38, r39, 1 260 (p8) sub r38 = r38, r39 261 adds r40 = 1, r0 262 br .L8 263 ;; 264 265 L(fix): cmp.geu p6, p7 = r39, r38 266 cmp.leu p8, p9 = r36, r37 267 ;; 268 (p8) cmp4.ne.and.orcm p6, p7 = 0, r0 269 (p6) br.cond.dptk L(bck) 270 sub r37 = r37, r36 271 (p9) sub r38 = r38, r39, 1 272 (p8) sub r38 = r38, r39 273 adds r18 = 1, r18 274 ;; 275 setf.sig f9 = r38 // n2 276 setf.sig f10 = r37 // n1 277 br L(bck) 278 279 EPILOGUE() 280 ASM_END()