github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/divrem_2.asm (about)

     1  dnl  IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
     2  
     3  dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C               norm   frac
    34  C itanium 1
    35  C itanium 2     29     29
    36  
    37  
    38  C TODO
    39  C  * Inline and interleave limb inversion code with loop setup code.
    40  C  * We should use explicit bundling in much of the code, since it typically
    41  C    cuts some cycles with the GNU assembler.
    42  
    43  
    44  ASM_START()
    45  
    46  C HP's assembler requires these declarations for importing mpn_invert_limb
    47  	.global	mpn_invert_limb
    48  	.type	mpn_invert_limb,@function
        // The routine below calls mpn_invert_limb once, passing d1 in r46 and
        // taking the result from r8 as the starting value of di.
    49  
    50  C INPUT PARAMETERS
    51  C qp   = r32
    52  C fn   = r33
    53  C np   = r34
    54  C nn   = r35
    55  C dp   = r36
        // Besides the five inputs above, the routine hands back a value in r8:
        // 1 when the divisor was subtracted once from the top two dividend limbs
        // before the main loop, 0 otherwise.
    56  
        // The m4 alias on the next line names f15, the register the routine loads
        // with the constant 1.  The visible code refers to f15 directly, so the
        // alias itself appears to be unused here.
    57  define(`f0x1', `f15')
    58  
        // Review note: the start-of-file macro is invoked a second time on the next
        // line; presumably harmless, but confirm against the ia64 macro definitions.
    59  ASM_START()
    60  PROLOGUE(mpn_divrem_2)
        // Divide the dividend at np (nn limbs, extended by fn zero limbs) by the
        // two-limb divisor d1:d0 read from dp.  Quotient limbs are stored through
        // r32 from the most significant end downward, the two remainder limbs are
        // written back through r34 at the end, and r8 returns 0 or 1.
        //
        // Register roles, as established by the code below:
        //   r32 = quotient store pointer      r33 = fn
        //   r34 = dividend read pointer       r35 = loop counter, nn + fn - 3 down
        //   r36 = d0                          r39 = d1
        //   r37 = low remainder limb          r38 = high remainder limb
        //   r18 = quotient limb being built   r40 = value returned in r8
        //   r41 = saved b0, r42 = saved ar.pfs, r43 = saved r1, r45 = saved ar.lc
        //   f32 = di (adjusted inverse)       f33 = d0, f34 = d1, f14 = -d1
        //   f15 = 1                           f9 = n2, f10 = n1
    61  	.prologue
        // When the 32-bit variant is configured: addp4 widens the three pointer
        // arguments and zxt4 zero-extends the two counts.
    62  ifdef(`HAVE_ABI_32',
    63  `	addp4		r32 = 0, r32		C M I
    64  	addp4		r34 = 0, r34		C M I
    65  	zxt4		r35 = r35		C I
    66  	addp4		r36 = 0, r36		C M I
    67  	nop.m		0
    68  	zxt4		r33 = r33		C I
    69  	;;
    70  ')
    71  	.save ar.pfs, r42
    72  	alloc	 r42 = ar.pfs, 5, 9, 1, 0	// 5 inputs, 9 locals, 1 output; old ar.pfs kept in r42
    73  	shladd	 r34 = r35, 3, r34	// r34 = np + 8*nn
    74  	adds	 r14 = 8, r36		// r14 = address of dp[1]
    75  	mov	 r43 = r1		// keep r1 across the call below
    76  	;;
    77  	adds	 r15 = -8, r34		// r15 = address of np[nn-1]
    78  	ld8	 r39 = [r14]		// d1 = dp[1]
    79  	.save ar.lc, r45
    80  	mov	 r45 = ar.lc		// keep caller ar.lc, restored before returning
    81  	adds	 r14 = -16, r34		// r14 = address of np[nn-2]
    82  	mov	 r40 = r0		// r40 = 0, later copied to r8 as the return value
    83  	adds	 r34 = -24, r34		// r34 = address of np[nn-3], next dividend limb to read
    84  	;;
    85  	ld8	 r38 = [r15]		// n2 = np[nn-1]
    86  	.save rp, r41
    87  	mov	 r41 = b0		// keep return address, restored before returning
    88  	.body
    89  	ld8	 r36 = [r36]		// d0 = dp[0]
    90  	ld8	 r37 = [r14]		// n1 = np[nn-2]
    91  	;;
        // Decide whether the top two dividend limbs n2:n1 are already below d1:d0.
    92  	cmp.gtu	 p6, p7 = r39, r38	// p6: d1 > n2
    93    (p6)	br.cond.dptk .L8		// n2:n1 is below the divisor, no correction
    94  	;;
    95  	cmp.leu	 p8, p9 = r36, r37	// p8: d0 <= n1, p9: d0 > n1
    96  	cmp.geu	 p6, p7 = r39, r38	// p7: d1 < n2 (p6 here means d1 == n2)
    97  	;;
    98    (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0	// 0 != r0 is false, so under p8: p6 = 0, p7 = 1
    99    (p7)	br.cond.dptk .L51		// n2:n1 >= d1:d0, subtract the divisor first
   100  .L8:
   101  	add	 r14 = r33, r35		// un + fn
   102  	mov	 r46 = r39		// argument to mpn_invert_limb
   103  	;;
   104  	adds	 r35 = -3, r14		// r35 = nn + fn - 3
   105  	;;
   106  	cmp.gt	 p12, p0 = r0, r35	// p12: r35 is negative
   107    (p12)	br.cond.dpnt L(end)	// nothing to develop, go store the remainder
   108  	br.call.sptk.many b0 = mpn_invert_limb
   109  	;;
   110  	setf.sig f11 = r8		// di (non-final)
   111  	setf.sig f34 = r39		// d1
   112  	setf.sig f33 = r36		// d0
   113  	mov	 r1 = r43		// restore r1 saved before the call
   114  	;;
   115  	mov	 r17 = 1
   116  	setf.sig f9 = r38		// n2
   117  	xma.l	 f6 = f11, f34, f0	// t0 = LO(di * d1)
   118  	;;
   119  	setf.sig f10 = r37		// n1
   120  	setf.sig f15 = r17		// 1
   121  	xma.hu	 f8 = f11, f33, f0	// s0 = HI(di * d0)
   122  	;;
   123  	getf.sig r17 = f6		// r17 = t0
   124  	getf.sig r16 = f8		// r16 = s0
   125  	mov	 ar.lc = r35		// the loop body runs r35 + 1 times
   126  	;;
   127  	sub	 r18 = r0, r39		// -d1
   128  	add	 r14 = r17, r36		// r14 = t0 + d0
   129  	;;
   130  	setf.sig f14 = r18		// -d1
   131  	cmp.leu	 p8, p9 = r17, r14	// p9: the add above wrapped around
   132  	add	 r16 = r14, r16		// r16 = low word of t0 + d0 + s0
   133  	;;
   134    (p9)	adds	 r19 = 0, r0		// t1 = 0 when the first add wrapped
   135    (p8)	adds	 r19 = -1, r0		// t1 = -1 otherwise
   136  	cmp.gtu	 p6, p7 = r14, r16	// p6: the second add wrapped around
   137  	;;
   138    (p6)	adds	 r19 = 1, r19		// t1 += carry of the second add
   139  	;;
        // Here r19:r16 is the signed two-word value t0 + d0 + s0 - 2^64.
        // Adjust di: as long as t1 (r19) is not negative, decrement di and
        // subtract d1 from the low word, borrowing from t1.  The m4 conditional
        // below always takes its first branch, which unrolls this step at most
        // four times; the looping form in its second branch is never selected.
   140  ifelse(1,1,`
   141  	cmp.gt	 p7, p6 = r0, r19	// p6: t1 >= 0, an adjustment is needed
   142  	;;
   143    (p6)	adds	 r8 = -1, r8		// di--
   144    (p6)	sub	 r14 = r16, r39		// t0 -= d1
   145    (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
   146  	;;
   147    (p6)	cmp.gt	 p9, p8 = 1, r19	// with borrow, p8: t1 >= 1, go again
   148    (p7)	cmp.gt	 p9, p8 = 0, r19	// without borrow, p8: t1 >= 0, go again
   149    (p6)	adds	 r19 = -1, r19		// t1 -= cy
   150  	mov	 r16 = r14
   151  	;;
   152    (p8)	adds	 r8 = -1, r8		// di--
   153    (p8)	sub	 r14 = r16, r39		// t0 -= d1
   154    (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
   155  	;;
   156    (p8)	cmp.gt	 p7, p6 = 1, r19	// same test as above with the predicate pairs swapped
   157    (p9)	cmp.gt	 p7, p6 = 0, r19
   158    (p8)	adds	 r19 = -1, r19		// t1 -= cy
   159  	mov	 r16 = r14
   160  	;;
   161    (p6)	adds	 r8 = -1, r8		// di--
   162    (p6)	sub	 r14 = r16, r39		// t0 -= d1
   163    (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
   164  	;;
   165    (p6)	cmp.gt	 p9, p8 = 1, r19
   166    (p7)	cmp.gt	 p9, p8 = 0, r19
   167    (p6)	adds	 r19 = -1, r19		// t1 -= cy
   168  	mov	 r16 = r14
   169  	;;
        // Fourth and last unrolled step: no further sign test follows it.
   170    (p8)	adds	 r8 = -1, r8		// di--
   171    (p8)	sub	 r14 = r16, r39		// t0 -= d1
   172    (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
   173  	;;
   174    (p8)	adds	 r19 = -1, r19		// t1 -= cy
   175  	mov	 r16 = r14
   176  ',`
        // Looping form of the adjustment; not selected by the conditional above.
   177  	cmp.gt	 p8, p9 = r0, r19
   178    (p8)	br.cond.dpnt .L46
   179  .L52:
   180  	cmp.leu	 p6, p7 = r39, r16
   181  	sub	 r14 = r16, r39
   182  	adds	 r8 = -1, r8
   183  	;;
   184    (p7)	adds	 r19 = -1, r19
   185  	mov	 r16 = r14
   186  	;;
   187    (p7)	cmp.gt	 p8, p9 = r0, r19
   188    (p9)	br.cond.dptk .L52
   189  .L46:
   190  ')
   191  	setf.sig f32 = r8		// di
   192  	shladd	 r32 = r35, 3, r32	// r32 = address of qp[nn+fn-3]; limbs are stored downward
   193  	;;
   194  
        // Main loop: each pass develops one quotient limb from n2:n1:n0 and
        // leaves the new two-limb remainder in r38:r37 (and in f9, f10).
   195  	ALIGN(16)
   196  L(top):	nop 0
   197  	nop 0
   198  	cmp.gt	 p8, p9 = r33, r35	// p8: r35 < fn, feed a zero limb instead of reading np
   199  	;;
   200   (p8)	mov	 r37 = r0		// n0 = 0
   201   (p9)	ld8	 r37 = [r34], -8	// n0 = next dividend limb, r34 steps down
        // f8 = HI(n2*di + n1) and f12 = q0 = LO(n2*di + n1)
   202  	xma.hu	 f8 = f9, f32, f10	//				0,29
   203  	xma.l	 f12 = f9, f32, f10	//				0
   204  	;;
   205  	getf.sig r20 = f12		// q0				4
   206  	xma.l	 f13 = f15, f8, f9	// q += n2			4
   207  	sub	 r8 = -1, r36		// bitnot d0
   208  	;;
        // r18 = q; f7 = LO(n1 - d1*q); f9:f6 = d0*q + d0
   209  	getf.sig r18 = f13		//				8
   210  	xma.l	 f7 = f14, f13, f10	//				8
   211  	xma.l	 f6 = f33, f13, f33	// t0 = LO(d0*q+d0)		8
   212  	xma.hu	 f9 = f33, f13, f33	// t1 = HI(d0*q+d0)		9
   213  	;;
   214  	getf.sig r38 = f7		// n1				12
   215  	getf.sig r16 = f6		//				13
   216  	getf.sig r19 = f9		//				14
   217  	;;
   218  	sub	 r38 = r38, r39		// n1 -= d1			17
   219  	;;
   220  	cmp.ne	 p9, p0 = r0, r0	// clear p9
   221  	cmp.leu	 p10, p11 = r16, r37	// cy for: n0 - t0		18
   222  	;;
   223  	sub	 r37 = r37, r16		// n0 -= t0			19
   224    (p11)	sub	 r38 = r38, r19, 1	// n1 -= t1 - cy		19
   225    (p10)	sub	 r38 = r38, r19		// n1 -= t1			19
   226  	;;
   227  	cmp.gtu	 p6, p7 = r20, r38	// n1 >= q0			20
   228  	;;
        // p6 (q0 > high remainder limb): the quotient limb becomes q + 1.
        // p7 (otherwise): the quotient limb stays q and d1:d0 is added back to
        // the remainder; p9 records the carry out of the low limb, detected as
        // low limb > bitnot d0 before the add.
   229    (p7)	cmp.ltu	 p9, p0 = r8, r37	//				21
   230    (p6)	add	 r18 = 1, r18		//
   231    (p7)	add	 r37 = r37, r36		//				21
   232    (p7)	add	 r38 = r38, r39		//				21
   233  	;;
   234  	setf.sig f10 = r37		// n1				22
   235    (p9)	add	 r38 = 1, r38		//				22
   236  	;;
   237  	setf.sig f9 = r38		// n2				23
        // p7 below: high remainder limb >= d1, take the extra correction path.
   238  	cmp.gtu	 p6, p7 = r39, r38	//				23
   239    (p7)	br.cond.spnt L(fix)
   240  L(bck):	st8	 [r32] = r18, -8	// store the quotient limb, r32 steps down
   241  	adds	 r35 = -1, r35		// count down for the fn compare at loop start
   242  	br.cloop.sptk.few L(top)
   243  	;;
   244  
        // Exit: write the two remainder limbs back through r34, restore the
        // saved state and return r40 in r8.
   245  L(end):	add	r14 = 8, r34		// r14 = r34 + 8
   246  	add	r15 = 16, r34		// r15 = r34 + 16
   247  	mov	 b0 = r41		// restore return address
   248  	;;
   249  	st8	[r14] = r37		// low remainder limb
   250  	st8	[r15] = r38		// high remainder limb
   251  	mov	 ar.pfs = r42
   252  	mov	 r8 = r40		// return value, 0 or 1
   253  	mov	 ar.lc = r45
   254  	br.ret.sptk.many b0
   255  	;;
        // Initial correction, reached when n2:n1 >= d1:d0: subtract the divisor
        // once from the top two limbs (p8 and p9 still hold the d0 <= n1 compare
        // made before the branch, p9 selecting the borrow) and make the return
        // value 1.
   256  .L51:
   257  	.pred.rel "mutex", p8, p9
   258  	sub	 r37 = r37, r36		// n1 -= d0
   259    (p9)	sub	 r38 = r38, r39, 1	// n2 -= d1 + borrow
   260    (p8)	sub	 r38 = r38, r39		// n2 -= d1
   261  	adds	 r40 = 1, r0		// return value becomes 1
   262  	br .L8
   263  	;;
   264  
        // Rare correction, reached when the high remainder limb r38 >= d1.
        // p6 survives only when r38 == d1 and r37 < d0, that is when r38:r37 is
        // already below the divisor; then nothing is changed.  Otherwise the
        // divisor is subtracted once more and the quotient limb is incremented.
   265  L(fix):	cmp.geu	 p6, p7 = r39, r38	// p6: d1 >= r38, here meaning equal
   266  	cmp.leu	 p8, p9 = r36, r37	// p8: d0 <= r37
   267  	;;
   268    (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0	// under p8 force p6 = 0, p7 = 1
   269    (p6)	br.cond.dptk L(bck)
   270  	sub	 r37 = r37, r36		// low limb -= d0
   271    (p9)	sub	 r38 = r38, r39, 1	// high limb -= d1 + borrow
   272    (p8)	sub	 r38 = r38, r39		// high limb -= d1
   273  	adds	 r18 = 1, r18		// q++
   274  	;;
   275  	setf.sig f9 = r38		// n2
   276  	setf.sig f10 = r37		// n1
   277  	br	 L(bck)
   278  
   279  EPILOGUE()
   280  ASM_END()