github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/aorsorrlshC_n.asm

dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

C           cycles/limb
C Itanium:      ?
C Itanium 2:    1.5

C TODO
C  * Use shladd in feed-in code (for mpn_addlshC_n).
C  * Rewrite loop to schedule loads closer to use, since we do prefetch.

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`vp', `r34')
define(`n',  `r35')

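C  For reference, a rough C sketch (not from the GMP sources) of what the
C  DO_add build of this file computes, assuming 64-bit limbs, 1 <= LSH <= 63,
C  and that LSH and the function name are supplied by the including file:
C  {rp,n} = {up,n} + 2^LSH * {vp,n}, returning the carry out.
C
C	mp_limb_t
C	mpn_addlshC_n (mp_limb_t *rp, const mp_limb_t *up,
C	               const mp_limb_t *vp, mp_size_t n)
C	{
C	  mp_limb_t hi = 0, cy = 0;		/* shifted-out bits, carry in */
C	  for (mp_size_t i = 0; i < n; i++)
C	    {
C	      mp_limb_t s = (vp[i] << LSH) | hi;	/* limb i of 2^LSH * vp */
C	      hi = vp[i] >> (64 - LSH);
C	      mp_limb_t w = up[i] + s;
C	      mp_limb_t c = w < s;			/* carry from the add */
C	      w += cy;
C	      cy = c + (w < cy);
C	      rp[i] = w;
C	    }
C	  return hi + cy;
C	}
C
C  The DO_sub and DO_rsb builds compute {up,n} - 2^LSH * {vp,n} and
C  2^LSH * {vp,n} - {up,n} in the same manner.
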
ifdef(`DO_add', `
  define(`ADDSUB',     `add	$1 = $2, $3')
  define(`CMP',        `cmp.ltu	$1,p0 = $2, $3')
  define(`INCR',       1)
  define(`LIM',        -1)
  define(`func',        mpn_addlsh`'LSH`'_n)')
ifdef(`DO_sub', `
  define(`ADDSUB',     `sub	$1 = $2, $3')
  define(`CMP',        `cmp.gtu	$1,p0 = $2, $3')
  define(`INCR',       -1)
  define(`LIM',        0)
  define(`func',        mpn_sublsh`'LSH`'_n)')
ifdef(`DO_rsb', `
  define(`ADDSUB',     `sub	$1 = $3, $2')
  define(`CMP',        `cmp.gtu	$1,p0 = $2, $4')
  define(`INCR',       -1)
  define(`LIM',        0)
  define(`func',        mpn_rsblsh`'LSH`'_n)')
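
C  ADDSUB(w, u, x) forms one result limb and CMP(p, w, u, x) sets predicate p
C  when that operation carried (add: the result compares below an operand) or
C  borrowed (sub and rsb: the result compares above the minuend).  INCR and
C  LIM drive the deferred carry propagation in the main loop: a pending carry
C  or borrow adds INCR to the next result limb, and cmpeqor extends the chain
C  while that limb equals LIM (all ones for add, zero for sub and rsb).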

define(PFDIST, 500)
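C  PFDIST is the lfetch prefetch distance, in bytes, ahead of up and vp.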

define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
define(`x0',`r30') define(`x1',`r31') define(`x2',`r3')  define(`x3',`r9')

C r3 r8 r9 r10 r11

ASM_START()
PROLOGUE(func)
	.prologue
	.save	ar.lc, r2
	.body
ifdef(`HAVE_ABI_32',`
	addp4	rp = 0, rp		C			M I
	addp4	up = 0, up		C			M I
	nop.i	0
	addp4	vp = 0, vp		C			M I
	nop.m	0
	zxt4	n = n			C			I
	;;
')
 {.mmi;	ld8	r11 = [vp], 8		C			M01
	ld8	r10 = [up], 8		C			M01
	mov.i	r2 = ar.lc		C			I0
}{.mmi;	and	r14 = 3, n		C			M I
	cmp.lt	p15, p0 = 4, n		C			M I
	add	n = -5, n		C			M I
	;;
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
	cmp.eq	p7, p0 = 2, r14		C			M I
	cmp.eq	p8, p0 = 3, r14		C			M I
}{.bbb
  (p6)	br.dptk	.Lb01			C			B
  (p7)	br.dptk	.Lb10			C			B
  (p8)	br.dptk	.Lb11			C			B
}
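
C  Feed-in: dispatch on n mod 4 (n = 0 mod 4 falls through to .Lb00).  Each
C  path below handles the leftover limbs and primes the software pipeline
C  before entering the unrolled main loop.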

.Lb00:
 {.mmi;	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
}{.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shl	x3 = r11, LSH		C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, r11, 64-LSH	C			I0
}{.mmb;	ADDSUB(	w3, r10, x3)		C			M I
	nop	0
  (p15)	br.dpnt	.grt4			C			B
	;;
}{.mii;	CMP(	p7, w3, r10, x3)	C			M II0
	shrp	x1 = v1, v0, 64-LSH	C			I0
	ADDSUB(	w0, u0, x0)		C			M I
	;;
}{.mii;	CMP(	p8, w0, u0, x0)		C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
	ADDSUB(	w1, u1, x1)		C			M I
}{.mmb;	nop	0
	nop	0
	br	.Lcj4			C			B
}
ALIGN(32)
.grt4:
 {.mii;	ld8	v3 = [vp], 8		C			M01
	shrp	x0 = v0, r11, 64-LSH	C			I0
	CMP(	p8, w3, r10, x3)	C			M I
	;;
}{.mmi;	ld8	u3 = [up], 8		C			M01
	add	r11 = PFDIST, vp
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	ADDSUB(	w0, u0, x0)		C			M I
	nop	0
	;;
}{.mmi;	CMP(	p6, w0, u0, x0)		C			M I
	add	r10 = PFDIST, up
	mov.i	ar.lc = n		C			I0
}{.mmb;	ADDSUB(	w1, u1, x1)		C			M I
	ld8	u0 = [up], 8		C			M01
	br	.LL00			C			B
}

	ALIGN(32)
.Lb01:
ifdef(`DO_add',
`	shladd	w2 = r11, LSH, r10	C			M I
	shr.u	r8 = r11, 64-LSH	C retval		I0
  (p15)	br.dpnt	.grt1			C			B
	;;
',`
	shl	x2 = r11, LSH		C			I0
  (p15)	br.dpnt	.grt1			C			B
	;;
	ADDSUB(	w2, r10, x2)		C			M I
	shr.u	r8 = r11, 64-LSH	C retval		I0
	;;
')
	CMP(	p6, w2, r10, x2)	C			M I
	br		.Lcj1

.grt1:	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	mov.i	ar.lc = n		C FIXME swap with next	I0
ifdef(`DO_add',
`',`
	ADDSUB(	w2, r10, x2)
')
	;;
 {.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, r11, 64-LSH	C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mmb;	CMP(	p6, w2, r10, x2)	C			M I
	ADDSUB(	w3, u3, x3)		C			M I
	br.cloop.dptk	.grt5		C			B
	;;
}{.mmi;	CMP(	p7, w3, u3, x3)		C			M I
	ADDSUB(	w0, u0, x0)		C			M I
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmb;	nop	0
	nop	0
	br	.Lcj5			C			B
}
.grt5:
 {.mmi;	add	r10 = PFDIST, up
	add	r11 = PFDIST, vp
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mmb;	ld8	v3 = [vp], 8		C			M01
	CMP(	p8, w3, u3, x3)		C			M I
	br	.LL01			C			B
}
	ALIGN(32)
.Lb10:
 {.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shl	x1 = r11, LSH		C			I0
}{.mmb;	nop	0
	nop	0
  (p15)	br.dpnt	.grt2			C			B
	;;
}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
	nop	0
	shrp	x2 = v2, r11, 64-LSH	C			I0
	;;
}{.mmi;	CMP(	p9, w1, r10, x1)	C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	shr.u	r8 = v2, 64-LSH		C retval		I0
	;;
}{.mmb;	CMP(	p6, w2, u2, x2)		C			M I
	nop	0
	br	.Lcj2			C			B
}
.grt2:
 {.mmi;	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
	;;
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	ld8	u0 = [up], 8		C			M01
	mov.i	ar.lc = n		C			I0
}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
	nop	0
	nop	0
	;;
}{.mii;	ld8	v1 = [vp], 8		C			M01
	shrp	x2 = v2, r11, 64-LSH	C			I0
	CMP(	p8, w1, r10, x1)	C			M I
	;;
}{.mmi;	add	r10 = PFDIST, up
	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, v2, 64-LSH	C			I0
}{.mmi;	add	r11 = PFDIST, vp
	ld8	v2 = [vp], 8		C			M01
	ADDSUB(	w2, u2, x2)		C			M I
	;;
}{.mmi;	CMP(	p6, w2, u2, x2)		C			M I
	ld8	u2 = [up], 8		C			M01
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mib;	ADDSUB(	w3, u3, x3)		C			M I
	nop	0
	br.cloop.dpnt	L(top)		C			B
}
	br	L(end)			C			B
.Lb11:
 {.mmi;	ld8	v1 = [vp], 8		C			M01
	ld8	u1 = [up], 8		C			M01
	shl	x0 = r11, LSH		C			I0
	;;
}{.mmi;	ld8	v2 = [vp], 8		C			M01
	ld8	u2 = [up], 8		C			M01
	shr.u	n = n, 2		C			I0
}{.mmb;	nop	0
	nop	0
  (p15)	br.dpnt	.grt3			C			B
	;;
}{.mii;	nop	0
	shrp	x1 = v1, r11, 64-LSH	C			I0
	ADDSUB(	w0, r10, x0)		C			M I
	;;
}{.mii;	CMP(	p8, w0, r10, x0)	C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
	ADDSUB(	w1, u1, x1)		C			M I
	;;
}{.mmb;	CMP(	p9, w1, u1, x1)		C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	br	.Lcj3			C			B
}
.grt3:
 {.mmi;	ld8	v3 = [vp], 8		C			M01
	ld8	u3 = [up], 8		C			M01
	shrp	x1 = v1, r11, 64-LSH	C			I0
}{.mmi;	ADDSUB(	w0, r10, x0)		C			M I
	nop	0
	nop	0
	;;
}{.mmi;	ld8	v0 = [vp], 8		C			M01
	CMP(	p6, w0, r10, x0)	C			M I
	mov.i	ar.lc = n		C			I0
}{.mmi;	ld8	u0 = [up], 8		C			M01
	ADDSUB(	w1, u1, x1)		C			M I
	nop	0
	;;
}{.mmi;	add	r10 = PFDIST, up
	add	r11 = PFDIST, vp
	shrp	x2 = v2, v1, 64-LSH	C			I0
}{.mmb;	ld8	v1 = [vp], 8		C			M01
	CMP(	p8, w1, u1, x1)		C			M I
	br	.LL11			C			B
}

C *** MAIN LOOP START ***
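C  The loop is unrolled four ways and counted by ar.lc/br.cloop.  Each pass
C  loads four limbs from up and vp, forms the shifted vp limbs with shrp,
C  applies ADDSUB, and stores four result limbs, while the carry or borrow
C  from the previous group is folded in through the p6/p8 predicate pair.
C  .LL00, .LL01 and .LL11 are the feed-in entry points for the other
C  n mod 4 residues.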
	ALIGN(32)
L(top):	st8	[rp] = w1, 8		C			M23
	lfetch	[r10], 32
   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
   (p8)	add	w2 = INCR, w2		C			M I
	ld8	v3 = [vp], 8		C			M01
	CMP(	p8, w3, u3, x3)		C			M I
	;;
.LL01:	ld8	u3 = [up], 8		C			M01
	shrp	x1 = v1, v0, 64-LSH	C			I0
   (p6)	cmpeqor	p8, p0 = LIM, w3	C			M I
   (p6)	add	w3 = INCR, w3		C			M I
	ld8	v0 = [vp], 8		C			M01
	ADDSUB(	w0, u0, x0)		C			M I
	;;
	st8	[rp] = w2, 8		C			M23
	CMP(	p6, w0, u0, x0)		C			M I
	nop.b	0
	ld8	u0 = [up], 8		C			M01
	lfetch	[r11], 32
	ADDSUB(	w1, u1, x1)		C			M I
	;;
.LL00:	st8	[rp] = w3, 8		C			M23
	shrp	x2 = v2, v1, 64-LSH	C			I0
   (p8)	cmpeqor	p6, p0 = LIM, w0	C			M I
   (p8)	add	w0 = INCR, w0		C			M I
	ld8	v1 = [vp], 8		C			M01
	CMP(	p8, w1, u1, x1)		C			M I
	;;
.LL11:	ld8	u1 = [up], 8		C			M01
	shrp	x3 = v3, v2, 64-LSH	C			I0
   (p6)	cmpeqor	p8, p0 = LIM, w1	C			M I
   (p6)	add	w1 = INCR, w1		C			M I
	ld8	v2 = [vp], 8		C			M01
	ADDSUB(	w2, u2, x2)		C			M I
	;;
 {.mmi;	st8	[rp] = w0, 8		C			M23
	CMP(	p6, w2, u2, x2)		C			M I
	shrp	x0 = v0, v3, 64-LSH	C			I0
}{.mib;
	ld8	u2 = [up], 8		C			M01
	ADDSUB(	w3, u3, x3)		C			M I
	br.cloop.dptk	L(top)		C			B
	;;
}
C *** MAIN LOOP END ***

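C  Wind-down: the .LcjN exit points store the remaining result limbs, apply
C  the last pending carry or borrow, restore ar.lc, and build the return
C  value in r8 from the bits shifted out of the top vp limb, adjusted by the
C  final carry (or borrow, for the rsb variant).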
L(end):
 {.mmi;	st8	[rp] = w1, 8		C			M23
   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
	shrp	x1 = v1, v0, 64-LSH	C			I0
}{.mmi;
   (p8)	add	w2 = INCR, w2		C			M I
	CMP(	p7, w3, u3, x3)		C			M I
	ADDSUB(	w0, u0, x0)		C			M I
	;;
}
.Lcj5:
 {.mmi;	st8	[rp] = w2, 8		C			M23
   (p6)	cmpeqor	p7, p0 = LIM, w3	C			M I
	shrp	x2 = v2, v1, 64-LSH	C			I0
}{.mmi;
   (p6)	add	w3 = INCR, w3		C			M I
	CMP(	p8, w0, u0, x0)		C			M I
	ADDSUB(	w1, u1, x1)		C			M I
	;;
}
.Lcj4:
 {.mmi;	st8	[rp] = w3, 8		C			M23
   (p7)	cmpeqor	p8, p0 = LIM, w0	C			M I
	mov.i	ar.lc = r2		C			I0
}{.mmi;
   (p7)	add	w0 = INCR, w0		C			M I
	CMP(	p9, w1, u1, x1)		C			M I
	ADDSUB(	w2, u2, x2)		C			M I
	;;
}
.Lcj3:
 {.mmi;	st8	[rp] = w0, 8		C			M23
   (p8)	cmpeqor	p9, p0 = LIM, w1	C			M I
	shr.u	r8 = v2, 64-LSH		C			I0
}{.mmi;
   (p8)	add	w1 = INCR, w1		C			M I
	CMP(	p6, w2, u2, x2)		C			M I
	nop	0
	;;
}
.Lcj2:
 {.mmi;	st8	[rp] = w1, 8		C			M23
   (p9)	cmpeqor	p6, p0 = LIM, w2	C			M I
   (p9)	add	w2 = INCR, w2		C			M I
	;;
}
.Lcj1:
 {.mmb;	st8	[rp] = w2		C			M23
ifdef(`DO_rsb',`
   (p6)	add	r8 = -1, r8		C			M I
',`
   (p6)	add	r8 = 1, r8		C			M I
')	br.ret.sptk.many b0		C			B
}
EPILOGUE()
ASM_END()