github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/divrem_1.asm (about)

     1  dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
     2  dnl  unnormalized limb.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund.
     5  
     6  dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  
    37  C         cycles/limb
    38  C Itanium:    40-42
    39  C Itanium 2:  29-30
    40  
    41  C This was generated by gcc, then the loops were optimized.  The preinv entry
    42  C point was shoehorned into the file.  Lots of things outside the loops could
    43  C be streamlined.  It would probably be a good idea to merge the loops for
    44  C normalized and unnormalized divisor, since the shifting stuff is done for
    45  C free in parallel with other operations.  It would even be possible to merge
    46  C all loops, if the ld8 were made conditional.
    47  
    48  C TODO
    49  C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
    50  C    computing leading limb.
    51  C  * Inline and interleave limb inversion code with loop setup code.
    52  
    53  ASM_START()
    54  
    55  C HP's assembler requires these declarations for importing mpn_invert_limb
    56  	.global	mpn_invert_limb
    57  	.type	mpn_invert_limb,@function
    58  
    59  C INPUT PARAMETERS
    60  C rp    = r32
    61  C qxn   = r33
    62  C up    = r34
    63  C n     = r35
    64  C vl    = r36
    65  C vlinv = r37  (preinv only)
    66  C cnt = r38    (preinv only)
    67  
    68  PROLOGUE(mpn_preinv_divrem_1)
        C mpn_preinv_divrem_1 -- entry point that takes a caller-supplied inverse
        C (vlinv, r37) and shift count (cnt, r38) instead of computing them.
        C Only the setup for the most significant limb is done here; control then
        C transfers to code shared with mpn_divrem_1 (.Lpn, .Lpu or .L435), which
        C also holds the common return sequence.
        C
        C Register frame: 7 inputs (r32-r38), 8 locals (r39-r46), 1 output (r47).
        C Saved state: r42 = ar.pfs, r44 = ar.lc, r41 = b0.
    69  	.prologue
    70  	.save	ar.pfs, r42
    71  	alloc		r42 = ar.pfs, 7, 8, 1, 0
    72  	.save	ar.lc, r44
    73  	mov		r44 = ar.lc
    74  	.save	rp, r41
    75  	mov		r41 = b0
    76  	.body
        C With the 32-bit ABI, widen the two pointers (addp4) and sign-extend the
        C two size arguments (sxt4) before use.
    77  ifdef(`HAVE_ABI_32',
    78  `	addp4		r32 = 0, r32
    79  	sxt4		r33 = r33
    80  	addp4		r34 = 0, r34
    81  	sxt4		r35 = r35
    82  	;;
    83  ')
        C r40 = cnt.  r34 = address of the most significant source limb up[n-1];
        C the ld8 fetches it into r39 and post-decrements r34 to up[n-2].
    84  	mov		r40 = r38
    85  	shladd		r34 = r35, 3, r34
    86  	;;
    87  	adds		r34 = -8, r34
    88  	;;
    89  	ld8		r39 = [r34], -8
    90  	;;
    91  
        C r8 = vlinv, the register the shared code reads the inverse from.
        C p8 = (0 <= vl) as a signed compare, i.e. the top bit of vl is clear and
        C the divisor is unnormalized.  p6/p7 = (vl <= high limb), unsigned.
    92  	add		r15 = r35, r33
    93  	;;
    94  	mov		r8 = r37
    95  	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
    96  	cmp.le		p8, p0 = 0, r36
    97  	;;
    98  	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
    99  	cmp.leu		p6, p7 = r36, r39
   100     (p8)	br.cond.dpnt	.Lpunnorm
   101  	;;
   102  
        C Normalized divisor: the top quotient limb is 1 when high limb >= vl,
        C else 0, and r38 becomes the matching partial remainder.  r35 is lowered
        C by 2 because .Lpn uses it as a loop count minus one for the limbs
        C that remain below the top one.
   103     (p6)	addl		r15 = 1, r0
   104     (p7)	mov		r15 = r0
   105  	;;
   106     (p6)	sub		r38 = r39, r36
   107     (p7)	mov		r38 = r39
   108  	st8		[r32] = r15, -8
   109  	adds		r35 = -2, r35		C un -= 2
   110  	br	.Lpn
   111  
        C Unnormalized divisor.  vl is shifted left by cnt in both sub-cases.
        C If high limb >= vl (p6, tested on the unshifted vl), r34 is moved back
        C to the high limb, r stays 0 and all n limbs are handled at .Lpu.
   112  .Lpunnorm:
   113     (p6)	add		r34 = 8, r34
   114  	mov		r38 = 0			C r = 0
   115  	shl		r36 = r36, r40
   116     (p6)	br.cond.dptk	.Lpu
   117  	;;
        C Otherwise the high limb is smaller than vl: it becomes the initial
        C remainder (shifted by cnt), the top quotient limb is stored as 0 and n
        C drops by one.  p8 tests the value of n before that decrement; when more
        C source limbs remain (n != 1) continue at .Lpu.
   118  	shl		r38 = r39, r40		C r = ahigh << cnt
   119  	cmp.ne		p8, p0 = 1, r35
   120  	st8		[r32] = r0, -8
   121  	adds		r35 = -1, r35		C un--
   122     (p8)	br.cond.dpnt	.Lpu
   123  
        C n was 1: no integer limbs remain.  Load the loop constants
        C (f6 = inverse, f12 = 1) and go straight to the fraction limbs at .L435.
   124  	mov		r23 = 1
   125  	;;
   126  	setf.sig	f6 = r8
   127  	setf.sig	f12 = r23
   128  	br		.L435
   129  EPILOGUE()
   130  
   131  
   132  PROLOGUE(mpn_divrem_1)
        C mpn_divrem_1 -- divide the n-limb number at up (r34, r35) by the single
        C limb vl (r36).  n integer quotient limbs followed by qxn (r33) fraction
        C limbs are stored downwards starting at rp[n+qxn-1]; the remainder is
        C returned in r8.
        C
        C Register frame: 5 inputs (r32-r36), 8 locals (r37-r44), 1 output (r45,
        C used as the argument to mpn_invert_limb).
        C Saved state: r42 = ar.pfs, r44 = ar.lc, r41 = b0, restored at .Lret.
        C
        C Roles in the loops below: r32 = quotient store pointer, r34 = source
        C load pointer, r36/f10 = divisor (shifted left by cnt), r38/f7 = running
        C remainder, f6 = inverse of the divisor, f12 = 1, r40 = cnt.
   133  	.prologue
   134  	.save	ar.pfs, r42
   135  	alloc		r42 = ar.pfs, 5, 8, 1, 0
   136  	.save	ar.lc, r44
   137  	mov		r44 = ar.lc
   138  	.save	rp, r41
   139  	mov		r41 = b0
   140  	.body
        C With the 32-bit ABI, widen the two pointers (addp4) and sign-extend the
        C two size arguments (sxt4) before use.
   141  ifdef(`HAVE_ABI_32',
   142  `	addp4		r32 = 0, r32
   143  	sxt4		r33 = r33
   144  	addp4		r34 = 0, r34
   145  	sxt4		r35 = r35
   146  	;;
   147  ')
        C r = 0.  When n + qxn is zero there is nothing to develop: return 0.
   148  	mov		r38 = r0
   149  	add		r15 = r35, r33
   150  	;;
   151  	cmp.ne		p6, p7 = 0, r15
   152  	;;
   153     (p7)	mov		r8 = r0
   154     (p7)	br.cond.dpnt	.Lret
        C p6 = (0 <= vl) as a signed compare, i.e. the top bit of vl is clear and
        C the divisor is unnormalized.
   155  	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
   156  	cmp.le		p6, p7 = 0, r36
   157  	;;
   158  	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
   159     (p6)	br.cond.dpnt	.Lunnorm
        C Normalized divisor.  With n == 0 only fraction limbs are wanted, so skip
        C the top-limb step.  Otherwise load the top limb up[n-1] into r38; the
        C top quotient limb is 1 when it is >= vl, else 0, and vl is subtracted
        C from r38 in the former case.
   160  	cmp.eq		p6, p7 = 0, r35
   161     (p6)	br.cond.dpnt	.L179
   162  	shladd		r14 = r35, 3, r34
   163  	;;
   164  	adds		r14 = -8, r14
   165  	adds		r35 = -1, r35
   166  	;;
   167  	ld8		r38 = [r14]
   168  	;;
   169  	cmp.leu		p6, p7 = r36, r38
   170  	;;
   171     (p6)	addl		r15 = 1, r0
   172     (p7)	mov		r15 = r0
   173  	;;
   174  	st8		[r32] = r15, -8
   175    (p6)	sub		r38 = r38, r36
   176  
        C Call mpn_invert_limb with vl in the output register r45; the result is
        C picked up from r8 below.  r35 becomes the loop count minus one, and r34
        C is advanced by that many limbs to the next limb to be read.
   177  .L179:
   178  	mov		r45 = r36
   179  	adds		r35 = -1, r35
   180  	br.call.sptk.many b0 = mpn_invert_limb
   181  	;;
   182  	shladd		r34 = r35, 3, r34
        C Shared entry for the normalized case (mpn_preinv_divrem_1 also branches
        C here).  f6 = inverse, f12 = 1, cnt = 0.  A negative r35 means no integer
        C limbs are left, so go directly to the fraction limbs.  ar.lc = r35 makes
        C the loop body run r35 + 1 times.
   183  .Lpn:
   184  	mov		r23 = 1
   185  	;;
   186  	setf.sig	f6 = r8
   187  	setf.sig	f12 = r23
   188  	cmp.le		p6, p7 = 0, r35
   189  	mov		r40 = 0
   190     (p7)	br.cond.dpnt	.L435
   191  	setf.sig	f10 = r36
   192  	mov		ar.lc = r35
   193  	setf.sig	f7 = r38
   194  	;;
        C NOTE(review): r28 is rewritten inside .Loop1 before it is read there, so
        C this initial value looks unused -- confirm before removing.
   195  	sub		r28 = -1, r36
   196  C Develop quotient limbs for normalized divisor
        C Per iteration: load the next source limb (r20); form the quotient
        C estimate q = high(nh * inverse) + nh; compute q * d as the pair r16:r15
        C and subtract it from nh:r20.  The borrow out of the low subtraction is
        C folded into the test of the high part by comparing against 1 instead
        C of 0.  While the high part is nonzero, d is subtracted and q bumped (up
        C to twice), then one last conditional subtract runs if the low part is
        C still >= d.  q is stored and the low part becomes the new nh.
   197  .Loop1:		C 00				C q=r18 nh=r38/f7
   198  	ld8		r20 = [r34], -8
   199  	xma.hu		f11 = f7, f6, f0
   200  	;;	C 04
   201  	xma.l		f8 = f11, f12, f7	C q = q + nh
   202  	;;	C 08
   203  	getf.sig	r18 = f8
   204  	xma.hu		f9 = f8, f10, f0
   205  	xma.l		f8 = f8, f10, f0
   206  	;;	C 12
   207  	getf.sig	r16 = f9
   208  		C 13
   209  	getf.sig	r15 = f8
   210  	;;	C 18
   211  	cmp.ltu		p6, p7 = r20, r15
   212  	sub		r15 = r20, r15
   213  	sub		r16 = r38, r16
   214  	;;	C 19
   215     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   216     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   217     (p6)	add		r16 = -1, r16
   218     (p0)	cmp.ne.unc	p6, p7 = r0, r0
   219  	;;	C 20
   220     (p8)	cmp.ltu		p6, p7 = r15, r36
   221     (p8)	sub		r15 = r15, r36
   222     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   223  	;;	C 21
   224  	.pred.rel "mutex",p6,p7
   225     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   226     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
   227  	cmp.ltu		p6, p7 = r15, r36	C speculative
   228  	sub		r28 = r15, r36		C speculative, just for cmp
   229  	;;	C 22
   230     (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   231     (p8)	mov		r15 = r28
   232     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   233  	;;	C 23
   234     (p6)	setf.sig	f7 = r15
   235     (p7)	sub		r15 = r15, r36
   236     (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   237  	;;	C 24
   238     (p7)	setf.sig	f7 = r15
   239  	st8		[r32] = r18, -8
   240  	mov		r38 = r15
   241  	br.cloop.dptk	.Loop1
   242  		C 29/30
   243  	br.sptk		.L435
   244  	;;
        C Unnormalized divisor.  r16 = vl with its bytes reversed, consumed by the
        C leading-zero computation at .L322.  With n == 0 go straight there.
        C Otherwise load the top limb up[n-1] into r39; when it is >= vl, r stays 0
        C and that limb is processed by the general code (r34 still points at it).
   245  .Lunnorm:
   246  	mux1		r16 = r36, @rev
   247  	cmp.eq		p6, p7 = 0, r35
   248     (p6)	br.cond.dpnt	.L322
   249  	shladd		r34 = r35, 3, r34
   250  	;;
   251  	adds		r34 = -8, r34
   252  	;;
   253  	ld8		r39 = [r34]
   254  	;;
   255  	cmp.leu		p6, p7 = r36, r39
   256     (p6)	br.cond.dptk	.L322
        C Top limb < vl: it becomes the initial remainder and the top quotient
        C limb is 0.  r15 still holds n + qxn; when that is 1 this was the only
        C limb, so return it as the remainder.  Otherwise one source limb fewer
        C remains.
   257  	adds		r34 = -8, r34
   258  	;;
   259  	mov		r38 = r39
   260  	;;
   261  	cmp.ne		p6, p7 = 1, r15
   262  	st8		[r32] = r0, -8
   263  	;;
   264     (p7)	mov		r8 = r38
   265     (p7)	br.cond.dpnt	.Lret
   266  	adds		r35 = -1, r35
        C Compute cnt (r40), the shift that moves the top set bit of vl to bit 63.
        C (-r16 | r16) has zero bytes exactly where vl has leading zero bytes;
        C czx1.l turns that into a byte index and r16 = 8*index - 8 is the bit
        C offset of the top nonzero byte of vl (per the czx1.l definition -- verify
        C against the ISA manual).  That byte is then narrowed by 4, 2 and 1 bits.
   267  .L322:
   268  	sub		r14 = r0, r16
   269  	;;
   270  	or		r14 = r16, r14
   271  	;;
   272  	mov		r16 = -8
   273  	czx1.l		r14 = r14
   274  	;;
   275  	shladd		r16 = r14, 3, r16
   276  	;;
   277  	shr.u		r14 = r36, r16
   278  	;;
   279  	cmp.geu		p6, p7 = 15, r14
   280  	;;
   281     (p7)	shr.u		r14 = r14, 4
   282     (p7)	adds		r16 = 4, r16
   283  	;;
   284  	cmp.geu		p6, p7 = 3, r14
   285  	;;
   286     (p7)	shr.u		r14 = r14, 2
   287     (p7)	adds		r16 = 2, r16
   288  	;;
   289  	tbit.nz		p6, p7 = r14, 1
   290  	;;
   291  	.pred.rel "mutex",p6,p7
   292    (p6)	sub		r40 = 62, r16
   293    (p7)	sub		r40 = 63, r16
   294  	;;
        C Shift the divisor and the initial remainder left by cnt, then call
        C mpn_invert_limb on the shifted divisor (argument in r45, result in r8).
   295  	shl		r45 = r36, r40
   296  	shl		r36 = r36, r40
   297  	shl		r38 = r38, r40
   298  	br.call.sptk.many b0 = mpn_invert_limb
   299  	;;
        C Shared entry for the unnormalized case (mpn_preinv_divrem_1 also
        C branches here).  f6 = inverse, f12 = 1.  With no source limbs left go to
        C the fraction limbs.  Otherwise load the next limb into r39 and merge its
        C top cnt bits into r.  r35 is lowered by 2: .Loop3 runs r35 + 1 times and
        C .Lend3 handles the final limb; a negative r35 goes straight to .Lend3.
   300  .Lpu:
   301  	mov		r23 = 1
   302  	;;
   303  	setf.sig	f6 = r8
   304  	setf.sig	f12 = r23
   305  	cmp.eq		p6, p7 = 0, r35
   306     (p6)	br.cond.dpnt	.L435
   307  	sub		r16 = 64, r40
   308  	adds		r35 = -2, r35
   309  	;;
   310  	ld8		r39 = [r34], -8
   311  	cmp.le		p6, p7 = 0, r35
   312  	;;
   313  	shr.u		r14 = r39, r16
   314  	;;
   315  	or		r38 = r14, r38
   316     (p7)	br.cond.dpnt	.Lend3
   317  	;;
   318  	mov		r22 = r16
   319  	setf.sig	f10 = r36
   320  	setf.sig	f7 = r38
   321  	mov		ar.lc = r35
   322  	;;
   323  C Develop quotient limbs for unnormalized divisor
        C Same step as .Loop1, except that the low numerator limb r20 is assembled
        C on the fly from two adjacent source limbs:
        C r20 = (r39 << cnt) | (r14 >> (64 - cnt)), where r14 is the limb just
        C loaded; r14 is carried over into r39 for the next iteration.
   324  .Loop3:
   325  	ld8		r14 = [r34], -8
   326  	xma.hu		f11 = f7, f6, f0
   327  	;;
   328  	xma.l		f8 = f11, f12, f7	C q = q + nh
   329  	;;
   330  	getf.sig	r18 = f8
   331  	xma.hu		f9 = f8, f10, f0
   332  	shl		r20 = r39, r40
   333  	xma.l		f8 = f8, f10, f0
   334  	shr.u		r24 = r14, r22
   335  	;;
   336  	getf.sig	r16 = f9
   337  	getf.sig	r15 = f8
   338  	or		r20 = r24, r20
   339  	;;
   340  	cmp.ltu		p6, p7 = r20, r15
   341  	sub		r15 = r20, r15
   342  	sub		r16 = r38, r16
   343  	;;
   344     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   345     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   346     (p6)	add		r16 = -1, r16
   347     (p0)	cmp.ne.unc	p6, p7 = r0, r0
   348  	;;
   349     (p8)	cmp.ltu		p6, p7 = r15, r36
   350     (p8)	sub		r15 = r15, r36
   351     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   352  	;;
   353  	.pred.rel "mutex",p6,p7
   354     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   355     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
   356  	cmp.ltu		p6, p7 = r15, r36	C speculative
   357  	sub		r28 = r15, r36		C speculative, just for cmp
   358  	;;
   359     (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   360     (p8)	mov		r15 = r28
   361     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   362  	;;
   363     (p6)	setf.sig	f7 = r15
   364     (p7)	sub		r15 = r15, r36
   365     (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   366  	;;
   367     (p7)	setf.sig	f7 = r15
   368  	st8		[r32] = r18, -8
   369  	mov		r39 = r14
   370  	mov		r38 = r15
   371  	br.cloop.dptk	.Loop3
   372  	;;
        C Final integer quotient limb for the unnormalized case: no lower source
        C limb exists, so the low numerator limb is just r39 << cnt.  The
        C correction steps are done without the speculative compare used in the
        C loops.
   373  .Lend3:
   374  	setf.sig	f10 = r36
   375  	setf.sig	f7 = r38
   376  	;;
   377  	xma.hu		f11 = f7, f6, f0
   378  	;;
   379  	xma.l		f8 = f11, f12, f7	C q = q + nh
   380  	;;
   381  	getf.sig	r18 = f8
   382  	xma.hu		f9 = f8, f10, f0
   383  	shl		r20 = r39, r40
   384  	xma.l		f8 = f8, f10, f0
   385  	;;
   386  	getf.sig	r16 = f9
   387  	getf.sig	r15 = f8
   388  	;;
   389  	cmp.ltu		p6, p7 = r20, r15
   390  	sub		r15 = r20, r15
   391  	sub		r16 = r38, r16
   392  	;;
   393     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   394     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   395     (p6)	add		r16 = -1, r16
   396     (p0)	cmp.ne.unc	p6, p7 = r0, r0
   397  	;;
   398     (p8)	cmp.ltu		p6, p7 = r15, r36
   399     (p8)	sub		r15 = r15, r36
   400     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   401  	;;
   402  	.pred.rel "mutex",p6,p7
   403     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   404     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
   405  	;;
   406     (p8)	sub		r15 = r15, r36
   407     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   408  	;;
   409  	cmp.ltu		p6, p7 = r15, r36
   410  	;;
   411     (p7)	sub		r15 = r15, r36
   412     (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   413  	;;
   414  	st8		[r32] = r18, -8
   415  	mov		r38 = r15
        C Fraction limbs, reached from every path including mpn_preinv_divrem_1.
        C Skipped when qxn < 1; otherwise .Loop4 runs qxn times (ar.lc = qxn - 1)
        C with a zero low numerator limb in place of a loaded source limb.
   416  .L435:
   417  	adds		r35 = -1, r33
   418  	cmp.le		p6, p7 = 1, r33
   419     (p7)	br.cond.dpnt	.Lend4
   420  	;;
   421  	setf.sig	f7 = r38
   422  	setf.sig	f10 = r36
   423  	mov		ar.lc = r35
   424  	;;
   425  .Loop4:
   426  	xma.hu		f11 = f7, f6, f0
   427  	;;
   428  	xma.l		f8 = f11, f12, f7	C q = q + nh
   429  	;;
   430  	getf.sig	r18 = f8
   431  	xma.hu		f9 = f8, f10, f0
   432  	xma.l		f8 = f8, f10, f0
   433  	;;
   434  	getf.sig	r16 = f9
   435  	getf.sig	r15 = f8
   436  	;;
   437  	cmp.ltu		p6, p7 = 0, r15
   438  	sub		r15 = 0, r15
   439  	sub		r16 = r38, r16
   440  	;;
   441     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
   442     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
   443     (p6)	add		r16 = -1, r16
   444     (p0)	cmp.ne.unc	p6, p7 = r0, r0
   445  	;;
   446     (p8)	cmp.ltu		p6, p7 = r15, r36
   447     (p8)	sub		r15 = r15, r36
   448     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   449  	;;
   450  	.pred.rel "mutex",p6,p7
   451     (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
   452     (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
   453  	cmp.ltu		p6, p7 = r15, r36	C speculative
   454  	sub		r28 = r15, r36		C speculative, just for cmp
   455  	;;
   456     (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
   457     (p8)	mov		r15 = r28
   458     (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   459  	;;
   460     (p6)	setf.sig	f7 = r15
   461     (p7)	sub		r15 = r15, r36
   462     (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
   463  	;;
   464     (p7)	setf.sig	f7 = r15
   465  	st8		[r32] = r18, -8
   466  	mov		r38 = r15
   467  	br.cloop.dptk	.Loop4
   468  	;;
        C Return value: the remainder shifted right by cnt to undo the divisor
        C shift (cnt is 0 on the normalized path).
   469  .Lend4:
   470  	shr.u		r8 = r38, r40
        C Common exit: restore ar.pfs, ar.lc and b0 from the registers they were
        C saved to in the prologue, then return.
   471  .Lret:
   472  	mov		ar.pfs = r42
   473  	mov		ar.lc = r44
   474  	mov		b0 = r41
   475  	br.ret.sptk.many b0
   476  EPILOGUE()
   477  ASM_END()