github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/mode1o.asm (about)

     1  dnl  Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
     2  
     3  dnl  Contributed to the GNU project by Kevin Ryde.
     4  
     5  dnl  Copyright 2003-2005 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C            cycles/limb
    37  C Itanium:      15
    38  C Itanium 2:     8
    39  
    40  
    41  dnl  Usage: ABI32(`code')
    42  dnl
    43  dnl  Emit the given code only under HAVE_ABI_32.
    44  dnl
    45  define(ABI32,
    46  m4_assert_onearg()
    47  `ifdef(`HAVE_ABI_32',`$1')')
    48  
    49  
    50  C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
    51  C                                mp_limb_t divisor, mp_limb_t carry);
    52  C
    53  C The modexact algorithm is usually conceived as a dependent chain
    54  C
    55  C	l = src[i] - c
    56  C	q = low(l * inverse)
    57  C	c = high(q*divisor) + (src[i]<c)
    58  C
    59  C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
    60  C separately (off the dependent chain) and using
    61  C
    62  C	q = low(c * inverse + si)
    63  C	c = high(q*divisor + c)
    64  C
    65  C This means the dependent chain is simply xma.l followed by xma.hu, for a
    66  C total 8 cycles/limb on itanium-2.
    67  C
    68  C The reason xma.hu works for the new c is that the low of q*divisor is
    69  C src[i]-c (being the whole purpose of the q generated, and it can be
    70  C verified algebraically).  If there was an underflow from src[i]-c, then
    71  C there will be an overflow from (src-c)+c, thereby adding 1 to the new c
    72  C the same as the borrow bit (src[i]<c) gives in the first style shown.
    73  C
    74  C Incidentally, fcmp is not an option for treating src[i]-c, since it
    75  C apparently traps to the kernel for unnormalized operands like those used
    76  C and generated by ldf8 and xma.  On one GNU/Linux system it took about 1200
    77  C cycles.
    78  C
    79  C
    80  C First Limb:
    81  C
    82  C The first limb uses q = (src[0]-c) * inverse shown in the first style.
    83  C This lets us get the first q as soon as the inverse is ready, without
    84  C going through si=s*inverse.  Basically at the start we have c and can use
    85  C it while waiting for the inverse, whereas for the second and subsequent
    86  C limbs it's the other way around, ie. we have the inverse and are waiting
    87  C for c.
    88  C
    89  C At .Lentry the first two instructions in the loop have been done already.
    90  C The load of f11=src[1] at the start (predicated on size>=2), and the
    91  C calculation of q by the initial different scheme.
    92  C
    93  C
    94  C Entry Sequence:
    95  C
    96  C In the entry sequence, the critical path is the calculation of the
    97  C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
    98  C established nice and early so the br.cloop's should predict perfectly.
    99  C And the load for the low limbs src[0] and src[1] can be initiated long
   100  C ahead of where they're needed.
   101  C
   102  C
   103  C Inverse Calculation:
   104  C
   105  C The initial 8-bit inverse is calculated using a table lookup.  If it hits
   106  C L1 (which is likely if we're called several times) then it should take a
   107  C total 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
   108  C the best approach, on balance.  It could be done bitwise, but that would
   109  C probably be about 14 cycles (2 per bit beyond the first couple).  Or it
   110  C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
   111  C but that would be about 11 cycles.
   112  C
   113  C The table is not the same as binvert_limb_table, instead it's 256 bytes,
   114  C designed to be indexed by the low byte of the divisor.  The divisor is
   115  C always odd, so the relevant data is every second byte in the table.  The
   116  C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
   117  C cycle because it must go down I0, and we're using the first I0 slot to get
   118  C ip.  The extra 128 bytes of padding should be insignificant compared to
   119  C typical ia64 code bloat.
   120  C
   121  C Having the table in .text allows us to use IP-relative addressing,
   122  C avoiding a fetch from ltoff.  .rodata is apparently not suitable for use
   123  C IP-relative, it gets a linker relocation overflow on GNU/Linux.
   124  C
   125  C
   126  C Load Scheduling:
   127  C
   128  C In the main loop, the data loads are scheduled for an L2 hit, which means
   129  C 6 cycles for the data ready to use.  In fact we end up 7 cycles ahead.  In
   130  C any case that scheduling is achieved simply by doing the load (and xmpy.l
   131  C for "si") in the immediately preceding iteration.
   132  C
   133  C The main loop requires size >= 2, and we handle size==1 by an initial
   134  C br.cloop to enter the loop only if size>1.  Since ar.lc is established
   135  C early, this should predict perfectly.
   136  C
   137  C
   138  C Not done:
   139  C
   140  C Consideration was given to using a plain "(src[0]-c) % divisor" for
   141  C size==1, but cycle counting suggests about 50 for the sort of approach
   142  C taken by gcc __umodsi3, versus about 47 for the modexact.  (Both assuming
   143  C L1 hits for their respective fetching.)
   144  C
   145  C Consideration was given to a test for high<divisor and replacing the last
   146  C loop iteration with instead c-=src[size-1] followed by c+=d if underflow.
   147  C Branching on high<divisor wouldn't be good since a mispredict would cost
   148  C more than the loop iteration saved, and the condition is of course data
   149  C dependent.  So the theory would be to shorten the loop count if
   150  C high<divisor, and predicate extra operations at the end.  That would mean
   151  C a gain of 6 when high<divisor, or a cost of 2 if not.
   152  C
   153  C Whether such a tradeoff is a win on average depends on assumptions about
   154  C how many bits in the high and the divisor.  If both are uniformly
   155  C distributed then high<divisor about 50% of the time.  But smallish
   156  C divisors (less chance of high<divisor) might be more likely from
   157  C applications (mpz_divisible_ui, mpz_gcd_ui, etc).  Though biggish divisors
   158  C would be normal internally from say mpn/generic/perfsqr.c.  On balance,
   159  C for the moment, it's felt the gain is not really enough to be worth the
   160  C trouble.
   161  C
   162  C
   163  C Enhancement:
   164  C
   165  C Process two source limbs per iteration using a two-limb inverse and a
   166  C sequence like
   167  C
   168  C	ql  = low (c * il + sil)	quotient low limb
   169  C	qlc = high(c * il + sil)
   170  C	qh1 = low (c * ih + sih)	quotient high, partial
   171  C
   172  C	cl = high (ql * d + c)		carry out of low
   173  C	qh = low (qlc * 1 + qh1)	quotient high limb
   174  C
   175  C	new c = high (qh * d + cl)	carry out of high
   176  C
   177  C This would be 13 cycles/iteration, giving 6.5 cycles/limb.  The two limb
   178  C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
   179  C chain with 4 multiplies.  The bigger inverse would take extra time to
   180  C calculate, but a one limb iteration to handle an odd size could be done as
   181  C soon as 64-bits of inverse were ready.
   182  C
   183  C Perhaps this could even extend to a 3 limb inverse, which might promise 17
   184  C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
   185  C
   186  
   187  ASM_START()
   188  	.explicit
   189  
   190  	.text
   191  	.align	32
   192  .Ltable:
   193  data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
   194  data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
   195  data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
   196  data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
   197  data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
   198  data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
   199  data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
   200  data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
   201  data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
   202  data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
   203  data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
   204  data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
   205  data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
   206  data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
   207  data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
   208  data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
   209  
   210  
   211  PROLOGUE(mpn_modexact_1c_odd)
   212  
   213  	C r32	src
   214  	C r33	size
   215  	C r34	divisor
   216  	C r35	carry
   217  
   218  	.prologue
   219  .Lhere:
   220  { .mmi;	add	r33 = -1, r33		C M0  size-1
   221  	mov	r14 = 2			C M1  2
   222  	mov	r15 = ip		C I0  .Lhere
   223  }{.mmi;	setf.sig f6 = r34		C M2  divisor
   224  	setf.sig f9 = r35		C M3  carry
   225  	zxt1	r3 = r34		C I1  divisor low byte
   226  }	;;
   227  
   228  { .mmi;	add	r3 = .Ltable-.Lhere, r3	C M0  table offset ip and index
   229  	sub	r16 = 0, r34		C M1  -divisor
   230  	.save	ar.lc, r2
   231  	mov	r2 = ar.lc		C I0
   232  }{.mmi;	.body
   233  	setf.sig f13 = r14		C M2  2 in significand
   234  	mov	r17 = -1		C M3  -1
   235  ABI32(`	zxt4	r33 = r33')		C I1  size extend
   236  }	;;
   237  
   238  { .mmi;	add	r3 = r3, r15		C M0  table entry address
   239  ABI32(` addp4	r32 = 0, r32')		C M1  src extend
   240  	mov	ar.lc = r33		C I0  size-1 loop count
   241  }{.mmi;	setf.sig f12 = r16		C M2  -divisor
   242  	setf.sig f8 = r17		C M3  -1
   243  }	;;
   244  
   245  { .mmi;	ld1	r3 = [r3]		C M0  inverse, 8 bits
   246  	ldf8	f10 = [r32], 8		C M1  src[0]
   247  	cmp.ne	p6,p0 = 0, r33		C I0  test size!=1
   248  }	;;
   249  
   250  	C Wait for table load.
   251  	C Hope for an L1 hit of 1 cycles to ALU, but could be more.
   252  	setf.sig f7 = r3		C M2  inverse, 8 bits
   253  (p6)	ldf8	f11 = [r32], 8		C M1  src[1], if size!=1
   254  	;;
   255  
   256  	C 5 cycles
   257  
   258  	C f6	divisor
   259  	C f7	inverse, being calculated
   260  	C f8	-1, will be -inverse
   261  	C f9	carry
   262  	C f10	src[0]
   263  	C f11	src[1]
   264  	C f12	-divisor
   265  	C f13	2
   266  	C f14	scratch
   267  
   268  	xmpy.l	f14 = f13, f7		C 2*i
   269  	xmpy.l	f7 = f7, f7		C i*i
   270  	;;
   271  	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 16 bits
   272  	;;
   273  
   274  	xmpy.l	f14 = f13, f7		C 2*i
   275  	xmpy.l	f7 = f7, f7		C i*i
   276  	;;
   277  	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 32 bits
   278  	;;
   279  
   280  	xmpy.l	f14 = f13, f7		C 2*i
   281  	xmpy.l	f7 = f7, f7		C i*i
   282  	;;
   283  
   284  	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 64 bits
   285  	xma.l	f10 = f9, f8, f10	C sc = c * -1 + src[0]
   286  	;;
   287  ASSERT(p6, `
   288  	xmpy.l	f15 = f6, f7 ;;	C divisor*inverse
   289  	getf.sig r31 = f15 ;;
   290  	cmp.eq	p6,p0 = 1, r31	C should == 1
   291  ')
   292  
   293  	xmpy.l	f10 = f10, f7		C q = sc * inverse
   294  	xmpy.l	f8 = f7, f8		C -inverse = inverse * -1
   295  	br.cloop.sptk.few.clr .Lentry	C main loop, if size > 1
   296  	;;
   297  
   298  	C size==1, finish up now
   299  	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
   300  	mov	ar.lc = r2		C I0
   301  	;;
   302  	getf.sig r8 = f9		C M2  return c
   303  	br.ret.sptk.many b0
   304  
   305  
   306  
   307  .Ltop:
   308  	C r2	saved ar.lc
   309  	C f6	divisor
   310  	C f7	inverse
   311  	C f8	-inverse
   312  	C f9	carry
   313  	C f10	src[i] * inverse
   314  	C f11	scratch src[i+1]
   315  
   316  	add	r16 = 160, r32
   317  	ldf8	f11 = [r32], 8		C src[i+1]
   318  	;;
   319  	C 2 cycles
   320  
   321  	lfetch	[r16]
   322  	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
   323  	;;
   324  	C 3 cycles
   325  
   326  .Lentry:
   327  	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
   328  	xmpy.l	f10 = f11, f7		C si = src[i] * inverse
   329  	br.cloop.sptk.few.clr .Ltop
   330  	;;
   331  
   332  
   333  
   334  	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
   335  	mov	ar.lc = r2		C I0
   336  	;;
   337  	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
   338  	;;
   339  	getf.sig r8 = f9		C M2  return c
   340  	br.ret.sptk.many b0
   341  
   342  EPILOGUE()