github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/add_n_sub_n.asm (about)

     1  dnl  IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2010 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C           cycles/limb
    36  C Itanium:      ?
    37  C Itanium 2:    2.25
    38  
    39  C INPUT PARAMETERS
        C   sp  destination of the limb sums u + v        (written with st8 [sp])
        C   dp  destination of the limb differences u - v (written with st8 [dp])
        C   up  first source operand,  loaded into u0..u3
        C   vp  second source operand, loaded into v0..v3
        C   n   number of 8-byte limbs to process
    40  define(`sp', `r32')
    41  define(`dp', `r33')
    42  define(`up', `r34')
    43  define(`vp', `r35')
    44  define(`n',  `r36')
    45  
    46  C Some useful aliases for registers we use
        C   u0..u3  limbs loaded from up      v0..v3  limbs loaded from vp
        C   s0..s3  sum limbs u + v           d0..d3  difference limbs u - v
        C   up0/vp0 are the incoming pointers themselves; up1/vp1 are second
        C   pointers that the code sets to up0 + 8 and vp0 + 8, so that each
        C   pair can step by 16 bytes and cover alternate limbs.
    47  define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
    48  define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
    49  define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27')
    50  define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31')
    51  define(`up0',`up')
    52  define(`up1',`r14')
    53  define(`vp0',`vp')
    54  define(`vp1',`r15')
    55  
    56  
    57  ASM_START()
    58  PROLOGUE(mpn_add_n_sub_n)
        C Computes, limb by limb, the sum u + v (stored through sp) and the
        C difference u - v (stored through dp) of the two n-limb operands at up
        C and vp.  On exit r8 = 2*carry + borrow, taken from the most significant
        C limb (see L(cj1)).
        C
        C Limbs are processed in groups of four, numbered 0..3 after the register
        C aliases u0..u3, v0..v3, s0..s3, d0..d3.
        C
        C Predicate usage:
        C   p6..p9    carry out of sum limbs 0..3
        C   p10..p13  borrow out of difference limbs 0..3
        C   p14/p15   n > 4 test used by the short-operand exits
        C
        C Carries are resolved late.  Each limb is first added/subtracted without
        C carry-in and its raw carry/borrow recorded with cmpltu.  One step later
        C the incoming carry is applied under predicate:
        C   (pC) cmpeqor pOUT = -1, s   if a carry comes in and s is all ones, the
        C                               increment wraps, so carry-out is set too
        C   (pC) add     s = 1, s       apply the carry
        C and likewise with 0 / -1 for the borrow of the difference.
    59  	.prologue
    60  	.save	ar.lc, r2
    61  	.body
        C 32-bit ABI only: widen the four pointers with addp4 and zero-extend n.
    62  ifdef(`HAVE_ABI_32',`
    63  	addp4	sp = 0, sp		C				M I
    64  	addp4	dp = 0, dp		C				M I
    65  	nop.i	0
    66  	addp4	up = 0, up		C				M I
    67  	addp4	vp = 0, vp		C				M I
    68  	zxt4	n = n			C				I
    69  	;;
    70  ')
    71  
        C Set-up shared by all entry paths:
        C   r9      = n mod 4, selects the entry point below
        C   r2      = saved ar.lc (restored at L(cj1))
        C   up1/vp1 = up0 + 8 / vp0 + 8
        C   r8      = (n - 2) >> 2, moved into ar.lc as the br.cloop count
        C   r10/r11 = up + 256 / vp + 256, pointers used by the lfetch pair
    72  	and	r9 = 3, n		C				M I
    73  	mov.i	r2 = ar.lc		C				I0
    74  	add	up1 = 8, up0		C				M I
    75  	add	vp1 = 8, vp0		C				M I
    76  	add	r8 = -2, n		C				M I
    77  	add	r10 = 256, up		C				M I
    78  	;;
    79  	shr.u	r8 = r8, 2		C				I0
    80  	cmp.eq	p10, p0 = 0, r9		C				M I
    81  	cmp.eq	p11, p0 = 2, r9		C				M I
    82  	cmp.eq	p12, p0 = 3, r9		C				M I
    83  	add	r11 = 256, vp		C				M I
    84  	;;
    85  	mov.i	ar.lc = r8		C				I0
    86    (p10)	br	L(b0)			C				B
    87    (p11)	br	L(b2)			C				B
    88    (p12)	br	L(b3)			C				B
    89  
        C n mod 4 = 1: process one limb in the limb-3 registers, so its carry p9
        C and borrow p13 feed limb 0 of the next group.  up1/vp1 are bumped by 8
        C to stay one limb ahead of up0/vp0.  p15 is the complement of (4 < n);
        C with n mod 4 = 1 that means this is the only limb (assuming n >= 1), and
        C the exit code at L(cj1) stores s3/d3 and builds the return value from
        C the r8 = 0 set here.
    90  L(b1):	ld8	u3 = [up0], 8		C				M01
    91  	add	up1 = 8, up1		C				M I
    92  	cmpltu	p14, p15 = 4, n		C				M I
    93  	ld8	v3 = [vp0], 8		C				M01
    94  	add	vp1 = 8, vp1		C				M I
    95  	;;
    96  	add	s3 = u3, v3		C				M I
    97  	sub	d3 = u3, v3		C				M I
    98  	mov	r8 = 0			C				M I
    99  	;;
   100  	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
   101  	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
   102    (p15)	br	L(cj1)			C				B
   103  	st8	[sp] = s3, 8		C				M23
   104  	st8	[dp] = d3, 8		C				M23
   105  	br	L(c0)			C				B
   106  
        C n mod 4 = 0: no limb handled yet, so clear the incoming carry p9 and
        C borrow p13 (r0 != r0 is never true).
   107  L(b0):	cmp.ne	p9, p0 = r0, r0		C				M I
   108  	cmp.ne	p13, p0 = r0, r0	C				M I
        C Load limbs 0..3 of both operands, then add/subtract limbs 0 and 1
        C without carry-in and record their raw carries and borrows.
   109  L(c0):	ld8	u0 = [up0], 16		C				M01
   110  	ld8	u1 = [up1], 16		C				M01
   111  	;;
   112  	ld8	v0 = [vp0], 16		C				M01
   113  	ld8	v1 = [vp1], 16		C				M01
   114  	;;
   115  	ld8	u2 = [up0], 16		C				M01
   116  	ld8	u3 = [up1], 16		C				M01
   117  	;;
   118  	ld8	v2 = [vp0], 16		C				M01
   119  	ld8	v3 = [vp1], 16		C				M01
   120  	;;
   121  	add	s0 = u0, v0		C				M I
   122  	add	s1 = u1, v1		C				M I
   123  	sub	d0 = u0, v0		C				M I
   124  	sub	d1 = u1, v1		C				M I
   125  	;;
   126  	cmpltu	p6, p0 = s0, v0		C  carry from add0		M I
   127  	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
   128  	cmpltu	p10, p0 = u0, v0	C borrow from sub0		M I
   129  	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
   130  	;;
        C Enter the main loop while ar.lc is non-zero; otherwise the four limbs
        C just loaded are the last ones and only the wind-down is needed.
   131  	nop	0			C
   132  	br.cloop.dptk	L(top)		C				B
   133  	br	L(end)			C				B
   134  
        C n mod 4 = 3: process one limb in the limb-1 registers (carry p7, borrow
        C p11), store it, then continue exactly like the n mod 4 = 2 case.
   135  L(b3):	ld8	u1 = [up0], 8		C				M01
   136  	add	up1 = 8, up1		C				M I
   137  	ld8	v1 = [vp0], 8		C				M01
   138  	;;
   139  	add	vp1 = 8, vp1		C				M I
   140  	add	s1 = u1, v1		C				M I
   141  	sub	d1 = u1, v1		C				M I
   142  	;;
   143  	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
   144  	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
   145  	;;
   146  	st8	[sp] = s1, 8		C				M23
   147  	st8	[dp] = d1, 8		C				M23
   148  	br	L(c2)			C				B
   149  
   150  	ALIGN(32)
        C n mod 4 = 2: clear the carry p7 and borrow p11 that feed limb 2.
   151  L(b2):	cmp.ne	p7, p0 = r0, r0		C				M I
   152  	cmp.ne	p11, p0 = r0, r0	C				M I
   153  	nop	0
        C Load limbs 2 and 3; p14 = (4 < n) tells whether more limbs follow.
   154  L(c2):	ld8	u2 = [up0], 16		C				M01
   155  	ld8	u3 = [up1], 16		C				M01
   156  	cmpltu	p14, p0 = 4, n		C				M I
   157  	;;
   158  	ld8	v2 = [vp0], 16		C				M01
   159  	ld8	v3 = [vp1], 16		C				M01
   160    (p14)	br	L(gt4)			C				B
   161  	;;
        C n <= 4 here: limbs 2 and 3 are the last two, finish them at L(cj2).
   162  	add	s2 = u2, v2		C				M I
   163  	add	s3 = u3, v3		C				M I
   164  	sub	d2 = u2, v2		C				M I
   165  	sub	d3 = u3, v3		C				M I
   166  	;;
   167  	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
   168  	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
   169  	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
   170  	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
   171  	br	L(cj2)			C				B
   172  	;;
        C n > 4: preload limbs 0 and 1 of the next group, compute limbs 2 and 3
        C without carry-in, and join the loop at its midpoint.  With n > 4 and
        C n mod 4 of 2 or 3 the count in ar.lc is at least 1, so the br.cloop
        C below is expected to be taken every time.
   173  L(gt4):	ld8	u0 = [up0], 16		C				M01
   174  	ld8	u1 = [up1], 16		C				M01
   175  	;;
   176  	ld8	v0 = [vp0], 16		C				M01
   177  	ld8	v1 = [vp1], 16		C				M01
   178  	;;
   179  	add	s2 = u2, v2		C				M I
   180  	add	s3 = u3, v3		C				M I
   181  	sub	d2 = u2, v2		C				M I
   182  	sub	d3 = u3, v3		C				M I
   183  	;;
   184  	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
   185  	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
   186  	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
   187  	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
   188  	br.cloop.dptk	L(mid)		C				B
   189  
   190  	ALIGN(32)
        C Main loop, four limbs per iteration.
        C First half: apply carry/borrow-in to limbs 0 and 1 and store them,
        C start limbs 2 and 3, and load limbs 0 and 1 of the following group.
   191  L(top):
   192  	ld8	u0 = [up0], 16		C				M01
   193  	ld8	u1 = [up1], 16		C				M01
   194     (p9)	cmpeqor	p6, p0 = -1, s0		C				M I
   195     (p9)	add	s0 = 1, s0		C				M I
   196    (p13)	cmpeqor	p10, p0 = 0, d0		C				M I
   197    (p13)	add	d0 = -1, d0		C				M I
   198  	;;
   199  	ld8	v0 = [vp0], 16		C				M01
   200  	ld8	v1 = [vp1], 16		C				M01
   201     (p6)	cmpeqor	p7, p0 = -1, s1		C				M I
   202     (p6)	add	s1 = 1, s1		C				M I
   203    (p10)	cmpeqor	p11, p0 = 0, d1		C				M I
   204    (p10)	add	d1 = -1, d1		C				M I
   205  	;;
   206  	st8	[sp] = s0, 8		C				M23
   207  	st8	[dp] = d0, 8		C				M23
   208  	add	s2 = u2, v2		C				M I
   209  	add	s3 = u3, v3		C				M I
   210  	sub	d2 = u2, v2		C				M I
   211  	sub	d3 = u3, v3		C				M I
   212  	;;
   213  	st8	[sp] = s1, 8		C				M23
   214  	st8	[dp] = d1, 8		C				M23
   215  	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
   216  	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
   217  	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
   218  	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
   219  	;;
        C Second half: apply carry/borrow-in to limbs 2 and 3 and store them,
        C start limbs 0 and 1 of the next group, and load its limbs 2 and 3.
   220  L(mid):
   221  	ld8	u2 = [up0], 16		C				M01
   222  	ld8	u3 = [up1], 16		C				M01
   223     (p7)	cmpeqor	p8, p0 = -1, s2		C				M I
   224     (p7)	add	s2 = 1, s2		C				M I
   225    (p11)	cmpeqor	p12, p0 = 0, d2		C				M I
   226    (p11)	add	d2 = -1, d2		C				M I
   227  	;;
   228  	ld8	v2 = [vp0], 16		C				M01
   229  	ld8	v3 = [vp1], 16		C				M01
   230     (p8)	cmpeqor	p9, p0 = -1, s3		C				M I
   231     (p8)	add	s3 = 1, s3		C				M I
   232    (p12)	cmpeqor	p13, p0 = 0, d3		C				M I
   233    (p12)	add	d3 = -1, d3		C				M I
   234  	;;
   235  	st8	[sp] = s2, 8		C				M23
   236  	st8	[dp] = d2, 8		C				M23
   237  	add	s0 = u0, v0		C				M I
   238  	add	s1 = u1, v1		C				M I
   239  	sub	d0 = u0, v0		C				M I
   240  	sub	d1 = u1, v1		C				M I
   241  	;;
   242  	st8	[sp] = s3, 8		C				M23
   243  	st8	[dp] = d3, 8		C				M23
   244  	cmpltu	p6, p0 = s0, v0		C  carry from add0		M I
   245  	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
   246  	cmpltu	p10, p0 = u0, v0	C borrow from sub0		M I
   247  	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
   248  	;;
        C Prefetch through r10/r11 (set up 256 bytes past up/vp, advanced by 32
        C per iteration) and loop while ar.lc is non-zero.
   249  	lfetch	[r10], 32		C				M?
   250  	lfetch	[r11], 32		C				M?
   251  	br.cloop.dptk	L(top)		C				B
   252  	;;
   253  
        C Wind-down for the last four limbs, already loaded: the same steps as
        C the first half of the loop, with nop 0 where L(top) has its loads.
   254  L(end):
   255  	nop	0
   256  	nop	0
   257     (p9)	cmpeqor	p6, p0 = -1, s0		C				M I
   258     (p9)	add	s0 = 1, s0		C				M I
   259    (p13)	cmpeqor	p10, p0 = 0, d0		C				M I
   260    (p13)	add	d0 = -1, d0		C				M I
   261  	;;
   262  	nop	0
   263  	nop	0
   264     (p6)	cmpeqor	p7, p0 = -1, s1		C				M I
   265     (p6)	add	s1 = 1, s1		C				M I
   266    (p10)	cmpeqor	p11, p0 = 0, d1		C				M I
   267    (p10)	add	d1 = -1, d1		C				M I
   268  	;;
   269  	st8	[sp] = s0, 8		C				M23
   270  	st8	[dp] = d0, 8		C				M23
   271  	add	s2 = u2, v2		C				M I
   272  	add	s3 = u3, v3		C				M I
   273  	sub	d2 = u2, v2		C				M I
   274  	sub	d3 = u3, v3		C				M I
   275  	;;
   276  	st8	[sp] = s1, 8		C				M23
   277  	st8	[dp] = d1, 8		C				M23
   278  	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
   279  	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
   280  	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
   281  	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
   282  	;;
        C Finish the final limbs 2 and 3.  r8 is reset to 0 here as the base of
        C the return value; limb 3 itself is stored at L(cj1).
   283  L(cj2):
   284     (p7)	cmpeqor	p8, p0 = -1, s2		C				M I
   285     (p7)	add	s2 = 1, s2		C				M I
   286    (p11)	cmpeqor	p12, p0 = 0, d2		C				M I
   287    (p11)	add	d2 = -1, d2		C				M I
   288  	mov	r8 = 0			C				M I
   289  	nop	0
   290  	;;
   291  	st8	[sp] = s2, 8		C				M23
   292  	st8	[dp] = d2, 8		C				M23
   293     (p8)	cmpeqor	p9, p0 = -1, s3		C				M I
   294     (p8)	add	s3 = 1, s3		C				M I
   295    (p12)	cmpeqor	p13, p0 = 0, d3		C				M I
   296    (p12)	add	d3 = -1, d3		C				M I
   297  	;;
        C Common exit: r8 becomes 2 if the final carry p9 is set, plus 1 if the
        C final borrow p13 is set.  ar.lc is restored from r2 and the most
        C significant sum and difference limbs are stored without post-increment.
   298  L(cj1):
   299     (p9)	mov	r8 = 2			C				M I
   300  	;;
   301  	mov.i	ar.lc = r2		C				I0
   302    (p13)	add	r8 = 1, r8		C				M I
   303  	st8	[sp] = s3		C				M23
   304  	st8	[dp] = d3		C				M23
   305  	br.ret.sptk.many b0		C				B
   306  EPILOGUE()
   307  ASM_END()