github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/gcd_1.asm (about) 1 dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd. 2 3 dnl Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn 4 dnl Granlund. 5 6 dnl Copyright 2002-2005, 2012, 2013, 2015 Free Software Foundation, Inc. 7 8 dnl This file is part of the GNU MP Library. 9 dnl 10 dnl The GNU MP Library is free software; you can redistribute it and/or modify 11 dnl it under the terms of either: 12 dnl 13 dnl * the GNU Lesser General Public License as published by the Free 14 dnl Software Foundation; either version 3 of the License, or (at your 15 dnl option) any later version. 16 dnl 17 dnl or 18 dnl 19 dnl * the GNU General Public License as published by the Free Software 20 dnl Foundation; either version 2 of the License, or (at your option) any 21 dnl later version. 22 dnl 23 dnl or both in parallel, as here. 24 dnl 25 dnl The GNU MP Library is distributed in the hope that it will be useful, but 26 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 27 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 28 dnl for more details. 29 dnl 30 dnl You should have received copies of the GNU General Public License and the 31 dnl GNU Lesser General Public License along with the GNU MP Library. If not, 32 dnl see https://www.gnu.org/licenses/. 33 34 include(`../config.m4') 35 36 37 C cycles/bitpair (1x1 gcd) 38 C Itanium: ? 39 C Itanium 2: 5.1 40 41 42 C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); 43 C 44 C The entry sequence is designed to expect xsize>1 and hence a modexact 45 C call. This ought to be more common than a 1x1 operation. Our critical 46 C path is thus stripping factors of 2 from y, calling modexact, then 47 C stripping factors of 2 from the x remainder returned. 48 C 49 C The common factors of 2 between x and y must be determined using the 50 C original x, not the remainder from the modexact. This is done with 51 C x_orig which is xp[0]. There's plenty of time to do this while the rest 52 C of the modexact etc is happening. 53 C 54 C It's possible xp[0] is zero. In this case the trailing zeros calculation 55 C popc((x-1)&~x) gives 63, and that's clearly no less than what y will 56 C have, making min(x_twos,y_twos) == y_twos. 57 C 58 C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then 59 C stripping factors of 2 from abs(x-y). Those factors of two are 60 C determined from just y-x, without the abs(), since there's the same 61 C number of trailing zeros on n or -n in twos complement. That makes the 62 C dependent chain 8 cycles deep. 63 C 64 C The selection of x-y versus y-x for abs(x-y), and the selection of the 65 C minimum of x and y, is done in parallel with the critical path. 66 C 67 C The algorithm takes about 0.68 iterations per bit (two N bit operands) on 68 C average, hence the final 5.8 cycles/bitpair. 69 C 70 C Not done: 71 C 72 C An alternate algorithm which didn't strip all twos, but instead applied 73 C tbit and predicated extr on x, and then y, was attempted. The loop was 6 74 C cycles, but the algorithm is an average 1.25 iterations per bitpair for a 75 C total 7.25 c/bp, which is slower than the current approach. 76 C 77 C Alternatives: 78 C 79 C Perhaps we could do something tricky by extracting a few high bits and a 80 C few low bits from the operands, and looking up a table which would give a 81 C set of predicates to control some shifts or subtracts or whatever. That 82 C could knock off multiple bits per iteration. 83 C 84 C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr 85 C only going down I0), perhaps it'd be possible to shift left instead, 86 C using add. That would mean keeping track of the lowest not-yet-zeroed 87 C bit, using some sort of mask. 88 C 89 C TODO: 90 C * Once mod_1_N exists in assembly for Itanium, add conditional calls. 91 C * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls). 92 C * Probably avoid popcnt also outside of loop, instead use ctz_table. 93 94 ASM_START() 95 .explicit C What does this mean? 96 97 C HP's assembler requires these declarations for importing mpn_modexact_1c_odd 98 .global mpn_modexact_1c_odd 99 .type mpn_modexact_1c_odd,@function 100 101 C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 102 103 deflit(MAXSHIFT, 7) 104 deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) 105 106 C .section ".rodata" 107 .rodata 108 ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep 109 ctz_table: 110 data1 MAXSHIFT 111 forloop(i,1,MASK, 112 ` data1 m4_count_trailing_zeros(i) 113 ') 114 115 PROLOGUE(mpn_gcd_1) 116 117 C r32 xp 118 C r33 xsize 119 C r34 y 120 121 define(x, r8) 122 define(xp_orig, r32) 123 define(xsize, r33) 124 define(y, r34) define(inputs, 3) 125 define(save_rp, r35) 126 define(save_pfs, r36) 127 define(x_orig, r37) 128 define(x_orig_one, r38) 129 define(y_twos, r39) define(locals, 5) 130 define(out_xp, r40) 131 define(out_xsize, r41) 132 define(out_divisor, r42) 133 define(out_carry, r43) define(outputs, 4) 134 135 .prologue 136 {.mmi; 137 ifdef(`HAVE_ABI_32', 138 ` addp4 r9 = 0, xp_orig define(xp,r9)', C M0 139 ` define(xp,xp_orig)') 140 .save ar.pfs, save_pfs 141 alloc save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2 142 .save rp, save_rp 143 mov save_rp = b0 C I0 144 }{.mbb; .body 145 add r10 = -1, y C M3 y-1 146 nop.b 0 C B0 147 nop.b 0 C B1 148 ;; 149 150 }{.mmi; ld8 x = [xp] C M0 x = xp[0] if no modexact 151 ld8 x_orig = [xp] C M1 orig x for common twos 152 cmp.ne p6,p0 = 1, xsize C I0 153 }{.mmi; andcm y_twos = r10, y C M2 (y-1)&~y 154 mov out_xp = xp_orig C M3 155 mov out_xsize = xsize C I1 156 ;; 157 }{.mmi; mov out_carry = 0 C M0 158 nop.m 0 C M1 159 popcnt y_twos = y_twos C I0 y twos 160 ;; 161 }{.mmi; add x_orig_one = -1, x_orig C M0 orig x-1 162 nop.m 0 C M1 163 shr.u out_divisor = y, y_twos C I0 y without twos 164 }{.mib; nop.m 0 C M2 165 shr.u y = y, y_twos C I1 y without twos 166 (p6) br.call.sptk.many b0 = mpn_modexact_1c_odd C if xsize>1 167 ;; 168 } 169 C modexact can leave x==0 170 {.mmi; cmp.eq p6,p0 = 0, x C M0 if {xp,xsize} % y == 0 171 andcm x_orig = x_orig_one, x_orig C M1 orig (x-1)&~x 172 add r9 = -1, x C I0 x-1 173 ;; 174 }{.mmi; andcm r9 = r9, x C M0 (x-1)&~x 175 nop.m 0 C M1 176 mov b0 = save_rp C I0 177 ;; 178 }{.mii; nop.m 0 C M0 179 popcnt x_orig = x_orig C I0 orig x twos 180 popcnt r9 = r9 C I0 x twos 181 ;; 182 }{.mmi; cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos 183 addl r22 = @ltoff(ctz_table), r1 184 shr.u x = x, r9 C I0 x odd 185 ;; 186 }{.mib; 187 (p7) mov y_twos = x_orig C M0 common twos 188 add r10 = -1, y C I0 y-1 189 (p6) br.dpnt.few L(done_y) C B0 x%y==0 then result y 190 ;; 191 } 192 mov r25 = m4_lshift(MASK, MAXSHIFT) 193 ld8 r22 = [r22] 194 br L(ent) 195 ;; 196 197 ALIGN(32) 198 L(top): 199 .pred.rel "mutex", p6,p7 200 {.mmi; (p7) mov y = x 201 (p6) sub x = x, y 202 dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits) 203 }{.mmi; and r20 = MASK, r19 204 (p7) mov x = r19 205 nop 0 206 ;; 207 } 208 L(mid): 209 {.mmb; ld1 r16 = [r21] 210 cmp.eq p10,p0 = 0, r20 211 (p10) br.spnt.few.clr L(shift_alot) 212 ;; 213 }{.mmi; nop 0 214 nop 0 215 shr.u x = x, r16 216 ;; 217 } 218 L(ent): 219 {.mmi; sub r19 = y, x 220 cmp.gtu p6,p7 = x, y 221 cmp.ne p8,p0 = x, y 222 }{.mmb; nop 0 223 nop 0 224 (p8) br.sptk.few.clr L(top) 225 } 226 227 L(done_y): C result is y 228 mov ar.pfs = save_pfs C I0 229 shl r8 = y, y_twos C I common factors of 2 230 br.ret.sptk.many b0 231 232 L(shift_alot): 233 and r20 = x, r25 234 shr.u x = x, MAXSHIFT 235 ;; 236 dep r21 = x, r22, 0, MAXSHIFT 237 br L(mid) 238 EPILOGUE()