github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/dive_1.asm (about)

     1  dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
     4  
     5  dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C            cycles/limb
    36  C Itanium:      16
    37  C Itanium 2:     8
    38  
    39  C INPUT PARAMETERS
    40  define(`rp', `r32')		C destination pointer, quotient limbs are stored through it
    41  define(`up', `r33')		C source pointer, dividend limbs are loaded through it
    42  define(`n',  `r34')		C limb count, reused as the adjusted loop count n-4
    43  define(`divisor', `r35')	C divisor limb, shifted right by rshift when even
    44  
    45  define(`lshift', `r24')		C 64 - rshift
    46  define(`rshift', `r25')		C number of low zero bits of divisor, 0 when odd
    47  
    48  C This code is a bit messy, and not as similar to mode1o.asm as desired.
    49  
    50  C The critical path during initialization is for computing the inverse of the
    51  C divisor.  Since odd divisors are probably common, we conditionally execute
    52  C the initial count_trailing_zeros code and the downshift.
    53  
    54  C Possible improvement: Merge more of the feed-in code into the inverse
    55  C computation.
    56  
    57  ASM_START()
    58  	.text
    59  	.align	32
        C Table of 8-bit modular inverses, indexed by the low byte of the divisor.
        C The byte at odd offset d satisfies d * byte == 1 mod 256, for instance
        C 1*0x01, 3*0xAB and 5*0xCD.  Even offsets hold a 0 filler.  The lookup is
        C done after the low zero bits of the divisor have been shifted out, so a
        C nonzero divisor always selects an odd offset.
        C The table sits in .text and is addressed relative to ip via .Ltab-.Lhere.
    60  .Ltab:
    61  data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
    62  data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
    63  data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
    64  data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
    65  data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
    66  data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
    67  data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
    68  data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
    69  data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
    70  data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
    71  data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
    72  data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
    73  data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
    74  data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
    75  data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
    76  data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
    77  
    78  
    79  PROLOGUE(mpn_divexact_1)
        C mpn_divexact_1 (rp, up, n, divisor)
        C
        C Writes n quotient limbs to rp, computed from the n limbs at up and the
        C single limb divisor, working from the low limb upwards.  Low zero bits of
        C the divisor are shifted out of both the divisor and the source limbs, and
        C each quotient limb is formed as (limb - carry) * inverse mod 2^64, where
        C inverse is the inverse of the odd divisor mod 2^64.  The inverse starts
        C as an 8-bit value from .Ltab and is widened by three Newton steps.
        C No remainder is computed, the carry out of the top limb is not used.
        C
        C Dispatch on n, after n has been replaced by n-4:
        C   n = 1	falls through to .Ln1
        C   n = 2	p7, branch to .Ln2
        C   n = 3	.Lgrt3, then p8 branches to .Lx3
        C   n >= 4	.Lgrt3, then the loop is entered at .Lent with ar.lc = n-4
        C
        C ar.lc is saved in r2 on entry and restored before returning.
    80  	.prologue
    81  	.save		ar.lc, r2
    82  	.body
    83  
    84   {.mmi;	add		r8 = -1, divisor	C M0
    85  	nop		0			C M1
    86  	tbit.z		p8, p9 = divisor, 0	C I0  p8: divisor even, p9: odd
    87  }
    88  ifdef(`HAVE_ABI_32',
    89  `	addp4		rp = 0, rp		C M2  rp extend
    90  	addp4		up = 0, up		C M3  up extend
    91  	sxt4		n = n')			C I1  size extend
    92  	;;
    93  .Lhere:
    94   {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
    95    (p8)	andcm		r8 = r8, divisor	C M1  mask of the low zero bits
    96  	mov		r15 = ip		C I0  .Lhere
    97  	;;
    98  }{.mii
    99  	.pred.rel "mutex", p8, p9
   100    (p9)	mov		rshift = 0		C M0
   101    (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
   102  	cmp.eq		p6, p10 = 1, n		C I1  p6: n = 1, p10: n != 1
   103  	;;
   104  }{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0  address of .Ltab
   105    (p8)	shr.u		divisor = divisor, rshift C I0
   106  	nop		0			C I1
   107  	;;
   108  }{.mmi;	add		n = -4, n		C M0  size-4
   109    (p10)	ld8		r21 = [up], 8		C M1  up[1]
   110  	mov		r14 = 2			C I0  2
   111  }{.mfi;	setf.sig	f6 = divisor		C M2  divisor
   112  	mov		f9 = f0			C M3  carry		FIXME
   113  	zxt1		r3 = divisor		C I1  divisor low byte
   114  	;;
   115  }{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
   116  	sub		r16 = 0, divisor	C M1  -divisor
   117  	mov		r2 = ar.lc		C I0
   118  }{.mmi;	sub		lshift = 64, rshift	C M2
   119  	setf.sig	f13 = r14		C M3  2 in significand
   120  	mov		r17 = -1		C I1  -1
   121  	;;
   122  }{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
   123  	nop		0			C M1
   124  	mov		ar.lc = n		C I0  size-4 loop count
   125  }{.mmi;	setf.sig	f12 = r16		C M2  -divisor
   126  	setf.sig	f8 = r17		C M3  -1
   127  	cmp.eq		p7, p0 = -2, n		C I1  p7: original n = 2
   128  	;;
   129  }{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
   130  	cmp.eq		p8, p0 = -1, n		C M0  p8: original n = 3
   131  	shr.u		r23 = r20, rshift	C I0
   132  	;;
   133  }
   134  
   135  	C f6	divisor
   136  	C f7	inverse, being calculated
   137  	C f8	-1, will be -inverse
   138  	C f9	carry
   139  	C f12	-divisor
   140  	C f13	2
   141  	C f14	scratch
   142  
   143  	xmpy.l		f14 = f13, f7		C Newton 2*i
   144  	xmpy.l		f7 = f7, f7		C Newton i*i
   145  	;;
   146  	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
   147  	;;
   148  	setf.sig	f10 = r23		C speculative, used iff n = 1
   149  	xmpy.l		f14 = f13, f7		C Newton 2*i
   150  	shl		r22 = r21, lshift	C speculative, used iff n > 1
   151  	xmpy.l		f7 = f7, f7		C Newton i*i
   152  	;;
   153  	or		r31 = r22, r23		C speculative, used iff n > 1
   154  	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
   155  	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
   156  	;;
   157  	setf.sig	f11 = r31		C speculative, used iff n > 1
   158  	xmpy.l		f14 = f13, f7		C Newton 2*i
   159  	xmpy.l		f7 = f7, f7		C Newton i*i
   160  	;;
   161  	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
   162  
   163    (p7)	br.cond.dptk	.Ln2			C n = 2
   164    (p10)	br.cond.dptk	.Lgrt3			C n != 1, n = 2 already taken
   165  	;;
   166  
        C n = 1 falls through to here
   167  .Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
   168  	br		.Lx1
   169  
   170  .Ln2:
   171  	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
   172  	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
   173  	setf.sig	f11 = r23
   174  	br		.Lx2
   175  
   176  .Lgrt3:
   177  	ld8		r21 = [up], 8		C up[2]
   178  	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
   179  	;;
   180  	shl		r22 = r21, lshift
   181  	;;
   182  	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
   183  	;;
   184  	or		r31 = r22, r23
   185  	shr.u		r23 = r21, rshift
   186  	;;
   187  	setf.sig	f11 = r31
   188    (p8)	br.cond.dptk	.Lx3			C branch for n = 3
   189  	;;
   190  	ld8		r21 = [up], 8
   191  	br		.Lent
   192  
   193  .Ltop:	ld8		r21 = [up], 8
   194  	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
   195  	nop.b		0
   196  	;;
   197  .Lent:	add		r16 = 160, up		C prefetch address, 160 bytes past up
   198  	shl		r22 = r21, lshift
   199  	nop.b		0
   200  	;;
   201  	stf8		[rp] = f12, 8
   202  	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
   203  	nop.b		0
   204  	nop.m		0
   205  	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
   206  	nop.b		0
   207  	;;
   208  	or		r31 = r22, r23
   209  	shr.u		r23 = r21, rshift
   210  	nop.b		0
   211  	;;
   212  	lfetch		[r16]
   213  	setf.sig	f11 = r31
   214  	br.cloop.sptk.few.clr .Ltop
   215  
   216  
   217  	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
   218  	;;
   219  .Lx3:	stf8		[rp] = f12, 8
   220  	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
   221  	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
   222  	;;
   223  	setf.sig	f11 = r23
   224  	;;
   225  	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
   226  	;;
   227  .Lx2:	stf8		[rp] = f12, 8
   228  	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
   229  	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
   230  	;;
   231  	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
   232  	;;
   233  .Lx1:	stf8		[rp] = f12, 8
   234  	mov		ar.lc = r2		C I0  restore caller loop count
   235  	br.ret.sptk.many b0
   236  EPILOGUE()