github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/rsh1aors_n.asm (about)

     1  dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2003-2005 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C         cycles/limb
    36  C Itanium:    2.5
    37  C Itanium 2:  1.5
    38  
    39  C TODO
    40  C  * Rewrite function entry code using aorslsh1_n.asm style.
    41  C  * Micro-optimize feed-in and wind-down code.
    42  
    43  C INPUT PARAMETERS
        C  rp = r32: where the result limbs are stored
        C  up = r33: first source operand (the minuend in the sub variant)
        C  vp = r34: second source operand (the subtrahend in the sub variant)
        C  n  = r35: number of limbs in each operand
    44  define(`rp',`r32')
    45  define(`up',`r33')
    46  define(`vp',`r34')
    47  define(`n',`r35')
    48  
    49  ifdef(`OPERATION_rsh1add_n',`
        C  Addition variant.  ADDSUB is the per-limb operation and func the
        C  entry point name.  PRED is the unsigned compare used after
        C  w = u + v: a carry out occurred when w < u.  With a carry coming in
        C  from the limb below, w is adjusted by INCR, and that adjustment
        C  itself carries when w was equal to LIM (all ones) beforehand.
    50    define(ADDSUB,       add)
    51    define(PRED,	       ltu)
    52    define(INCR,	       1)
    53    define(LIM,	       -1)
    54    define(func, mpn_rsh1add_n)
    55  ')
    56  ifdef(`OPERATION_rsh1sub_n',`
        C  Subtraction variant.  After w = u - v a borrow occurred when w > u.
        C  With a borrow coming in, w is adjusted by INCR (minus one), and that
        C  adjustment itself borrows when w was equal to LIM (zero) beforehand.
    57    define(ADDSUB,       sub)
    58    define(PRED,	       gtu)
    59    define(INCR,	       -1)
    60    define(LIM,	       0)
    61    define(func, mpn_rsh1sub_n)
    62  ')
    63  
    64  C Some useful aliases for registers we use
        C  uK, vK: limbs loaded from up[] and vp[]
        C  wK:     uK ADDSUB vK, later adjusted by INCR when a carry comes in
        C  xK:     result limb, formed by shifting a pair of w limbs right by
        C          one bit, and then stored to rp[]
        C  The index K cycles 0,1,2,3 through consecutive limbs.
    65  define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
    66  define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
    67  define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
    68  define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
    69  
    70  MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
    71  
        C  func (rp, up, vp, n)
        C
        C  Computes rp[] = (up[] ADDSUB vp[]) >> 1 over n limbs, where func and
        C  ADDSUB are selected by OPERATION_rsh1add_n or OPERATION_rsh1sub_n.
        C
        C  Result:
        C   * r8 is bit 0 of the lowest sum/difference limb, that is the bit
        C     shifted out at the bottom.
        C   * The carry (or borrow) out of the highest limb is deposited into
        C     bit 63 of the highest result limb (the dep at .Lcj2 and in .Lb01).
        C
        C  Registers: r2 holds the caller ar.lc and is restored before return.
        C  Besides the aliases defined above, r10/r11 hold limb 0 of up/vp, r14
        C  holds n mod 4 (and is reused as u0 afterwards), p15 is n > 4, and
        C  p6, p7, p8, p9 hold the carry out of w2, w3, w0, w1 respectively.
        C
        C  Layout: the entry code branches on n mod 4 to .Lb00/.Lb01/.Lb10/.Lb11.
        C  When n <= 4 these compute the limbs straight-line and join the
        C  wind-down code at .Lcj4/.Lcj1/.Lcj2/.Lcj3 respectively.  When n > 4
        C  the .grtN code fills the pipeline and sets ar.lc.  .grt4 always
        C  enters the 4-limb main loop at .LL00; the others enter it at
        C  .LL01/.LL10/.LL11 or, if no loop pass is needed, join the wind-down
        C  at .Lcj5/.Lcj6/.Lcj7.
        C
        C  NOTE(review): n = 0 is not special-cased; it has n mod 4 = 0 and
        C  p15 clear, so it would run the four-limb path at .Lb00.  Presumably
        C  callers guarantee n >= 1 -- confirm against the mpn interface rules.
        C
        C  NOTE(review): the right-hand tags (M I, M01, I0, M23, B) look like
        C  the execution unit classes each instruction may issue on -- confirm.
    72  ASM_START()
    73  PROLOGUE(func)
    74  	.prologue
    75  	.save	ar.lc, r2
    76  	.body
        C  With HAVE_ABI_32, convert the three pointers with addp4 and
        C  zero-extend n with zxt4 before use.
    77  ifdef(`HAVE_ABI_32',`
    78  	addp4		rp = 0, rp		C			M I
    79  	addp4		up = 0, up		C			M I
    80  	addp4		vp = 0, vp		C			M I
    81  	nop.m		0
    82  	nop.m		0
    83  	zxt4		n = n			C			I
    84  	;;
    85  ')
        C  Load limb 0 of vp and up, save ar.lc in r2, and classify n:
        C  r14 = n mod 4, p15 = (n > 4), n = n - 4.  Then dispatch on r14.
    86   {.mmi;	ld8		r11 = [vp], 8		C			M01
    87  	ld8		r10 = [up], 8		C			M01
    88  	mov.i		r2 = ar.lc		C			I0
    89  }{.mmi;	and		r14 = 3, n		C			M I
    90  	cmp.lt		p15, p0 = 4, n		C			M I
    91  	add		n = -4, n		C			M I
    92  	;;
    93  }{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
    94  	cmp.eq		p7, p0 = 2, r14		C			M I
    95  	cmp.eq		p8, p0 = 3, r14		C			M I
    96  }{.bbb
    97    (p6)	br.dptk		.Lb01			C			B
    98    (p7)	br.dptk		.Lb10			C			B
    99    (p8)	br.dptk		.Lb11			C			B
   100  }
   101  
        C  n mod 4 = 0.  Limb 0 goes into w3; w0, w1, w2 follow.
   102  .Lb00:	ld8		v0 = [vp], 8		C			M01
   103  	ld8		u0 = [up], 8		C			M01
   104  	shr.u		n = n, 2		C			I0
   105  	;;
   106  	ld8		v1 = [vp], 8		C			M01
   107  	ld8		u1 = [up], 8		C			M01
   108  	ADDSUB		w3 = r10, r11		C			M I
   109  	;;
   110  	ld8		v2 = [vp], 8		C			M01
   111  	ld8		u2 = [up], 8		C			M01
   112    (p15)	br.dpnt		.grt4			C			B
   113  	;;
   114  
        C  p15 clear: only the four limbs already loaded; finish at .Lcj4.
   115  	cmp.PRED	p7, p0 = w3, r10	C			M I
   116  	and		r8 = 1, w3		C			M I
   117  	ADDSUB		w0 = u0, v0		C			M I
   118  	;;
   119  	cmp.PRED	p8, p0 = w0, u0		C			M I
   120  	ADDSUB		w1 = u1, v1		C			M I
   121  	;;
   122  	cmp.PRED	p9, p0 = w1, u1		C			M I
   123     (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
   124     (p7)	add		w0 = INCR, w0		C			M I
   125  	;;
   126  	shrp		x3 = w0, w3, 1		C			I0
   127  	ADDSUB		w2 = u2, v2		C			M I
   128     (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
   129     (p8)	add		w1 = INCR, w1		C			M I
   130  	br		.Lcj4			C			B
   131  
        C  n > 4: keep loading, set ar.lc = ((n - 4) >> 2) - 1, and enter the
        C  main loop at .LL00.
   132  .grt4:	ld8		v3 = [vp], 8		C			M01
   133  	cmp.PRED	p7, p0 = w3, r10	C			M I
   134  	ld8		u3 = [up], 8		C			M01
   135  	and		r8 = 1, w3		C			M I
   136  	;;
   137  	ADDSUB		w0 = u0, v0		C			M I
   138  	ld8		v0 = [vp], 8		C			M01
   139  	add		n = -1, n
   140  	;;
   141  	cmp.PRED	p8, p0 = w0, u0		C			M I
   142  	ld8		u0 = [up], 8		C			M01
   143  	ADDSUB		w1 = u1, v1		C			M I
   144  	;;
   145  	ld8		v1 = [vp], 8		C			M01
   146  	mov.i		ar.lc = n		C			I0
   147  	cmp.PRED	p9, p0 = w1, u1		C			M I
   148  	ld8		u1 = [up], 8		C			M01
   149     (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
   150     (p7)	add		w0 = INCR, w0		C			M I
   151  	;;
   152  	ADDSUB		w2 = u2, v2		C			M I
   153  	ld8		v2 = [vp], 8		C			M01
   154  	shrp		x3 = w0, w3, 1		C			I0
   155     (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
   156     (p8)	add		w1 = INCR, w1		C			M I
   157  	br		.LL00			C			B
   158  
   159  
        C  n mod 4 = 1.  Limb 0 goes into w2.
   160  .Lb01:	ADDSUB		w2 = r10, r11		C			M I
   161  	shr.u		n = n, 2		C			I0
   162    (p15)	br.dpnt		.grt1			C			B
   163  	;;
   164  
        C  p15 clear (n = 1): the single result limb is w2 >> 1 with the carry
        C  out placed in bit 63; finish at .Lcj1.
   165  	cmp.PRED	p6, p7 = w2, r10	C			M I
   166  	shr.u		x2 = w2, 1		C			I0
   167  	and		r8 = 1, w2		C			M I
   168  	;;
   169     (p6)	dep		x2 = -1, x2, 63, 1	C			I0
   170  	br		.Lcj1			C			B
   171  
        C  n > 4: ar.lc = (n - 4) >> 2.  The br.cloop below continues the
        C  feed-in at .grt5 when a loop pass is needed, else falls through.
   172  .grt1:	ld8		v3 = [vp], 8		C			M01
   173  	ld8		u3 = [up], 8		C			M01
   174  	;;
   175  	ld8		v0 = [vp], 8		C			M01
   176  	ld8		u0 = [up], 8		C			M01
   177  	mov.i		ar.lc = n		C FIXME swap with next	I0
   178  	;;
   179  	ld8		v1 = [vp], 8		C			M01
   180  	ld8		u1 = [up], 8		C			M01
   181  	;;
   182  	ld8		v2 = [vp], 8		C			M01
   183  	ld8		u2 = [up], 8		C			M01
   184  	cmp.PRED	p6, p0 = w2, r10	C			M I
   185  	and		r8 = 1, w2		C			M I
   186  	ADDSUB		w3 = u3, v3		C			M I
   187  	br.cloop.dptk	.grt5			C			B
   188  	;;
   189  
        C  No loop pass (n = 5): finish via .Lcj5.
   190  	cmp.PRED	p7, p0 = w3, u3		C			M I
   191  	;;
   192  	ADDSUB		w0 = u0, v0		C			M I
   193     (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
   194     (p6)	add		w3 = INCR, w3		C			M I
   195  	;;
   196  	cmp.PRED	p8, p0 = w0, u0		C			M I
   197  	shrp		x2 = w3, w2, 1		C			I0
   198  	ADDSUB		w1 = u1, v1		C			M I
   199  	;;
   200  	cmp.PRED	p9, p0 = w1, u1		C			M I
   201     (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
   202     (p7)	add		w0 = INCR, w0		C			M I
   203  	br		.Lcj5			C			B
   204  
        C  Continue the feed-in with loads, then enter the main loop at .LL01.
   205  .grt5:	ld8		v3 = [vp], 8		C			M01
   206  	cmp.PRED	p7, p0 = w3, u3		C			M I
   207  	ld8		u3 = [up], 8		C			M01
   208  	;;
   209  	ADDSUB		w0 = u0, v0		C			M I
   210  	ld8		v0 = [vp], 8		C			M01
   211     (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
   212     (p6)	add		w3 = INCR, w3		C			M I
   213  	;;
   214  	cmp.PRED	p8, p0 = w0, u0		C			M I
   215  	shrp		x2 = w3, w2, 1		C			I0
   216  	ld8		u0 = [up], 8		C			M01
   217  	ADDSUB		w1 = u1, v1		C			M I
   218  	;;
   219  	ld8		v1 = [vp], 8		C			M01
   220  	cmp.PRED	p9, p0 = w1, u1		C			M I
   221  	ld8		u1 = [up], 8		C			M01
   222     (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
   223     (p7)	add		w0 = INCR, w0		C			M I
   224  	br		.LL01			C			B
   225  
   226  
        C  n mod 4 = 2.  Limb 0 goes into w1, limb 1 into w2.
   227  .Lb10:	ld8		v2 = [vp], 8		C			M01
   228  	ld8		u2 = [up], 8		C			M01
   229  	shr.u		n = n, 2		C			I0
   230  	ADDSUB		w1 = r10, r11		C			M I
   231    (p15)	br.dpnt		.grt2			C			B
   232  	;;
   233  
        C  p15 clear (n = 2): finish at .Lcj2.
   234  	cmp.PRED	p9, p0 = w1, r10	C			M I
   235  	and		r8 = 1, w1		C			M I
   236  	ADDSUB		w2 = u2, v2		C			M I
   237  	;;
   238  	cmp.PRED	p6, p0 = w2, u2		C			M I
   239  	;;
   240     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   241     (p9)	add		w2 = INCR, w2		C			M I
   242  	;;
   243  	shrp		x1 = w2, w1, 1		C			I0
   244  	shr.u		x2 = w2, 1		C			I0
   245  	br		.Lcj2			C			B
   246  
        C  n > 4: ar.lc = (n - 4) >> 2.  The br.cloop below continues the
        C  feed-in at .grt6 when a loop pass is needed, else falls through.
   247  .grt2:	ld8		v3 = [vp], 8		C			M01
   248  	ld8		u3 = [up], 8		C			M01
   249  	;;
   250  	ld8		v0 = [vp], 8		C			M01
   251  	ld8		u0 = [up], 8		C			M01
   252  	mov.i		ar.lc = n		C			I0
   253  	;;
   254  	ld8		v1 = [vp], 8		C			M01
   255  	cmp.PRED	p9, p0 = w1, r10	C			M I
   256  	ld8		u1 = [up], 8		C			M01
   257  	and		r8 = 1, w1		C			M I
   258  	;;
   259  	ADDSUB		w2 = u2, v2		C			M I
   260  	ld8		v2 = [vp], 8		C			M01
   261  	;;
   262  	cmp.PRED	p6, p0 = w2, u2		C			M I
   263  	ld8		u2 = [up], 8		C			M01
   264  	ADDSUB		w3 = u3, v3		C			M I
   265  	br.cloop.dptk	.grt6			C			B
   266  	;;
   267  
        C  No loop pass (n = 6): finish via .Lcj6.
   268  	cmp.PRED	p7, p0 = w3, u3		C			M I
   269     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   270     (p9)	add		w2 = INCR, w2		C			M I
   271  	;;
   272  	shrp		x1 = w2, w1, 1		C			I0
   273  	ADDSUB		w0 = u0, v0		C			M I
   274     (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
   275     (p6)	add		w3 = INCR, w3		C			M I
   276  	br		.Lcj6			C			B
   277  
        C  Continue the feed-in with loads, then enter the main loop at .LL10.
   278  .grt6:	ld8		v3 = [vp], 8		C			M01
   279  	cmp.PRED	p7, p0 = w3, u3		C			M I
   280  	ld8		u3 = [up], 8		C			M01
   281     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   282     (p9)	add		w2 = INCR, w2		C			M I
   283  	;;
   284  	shrp		x1 = w2, w1, 1		C			I0
   285  	ADDSUB		w0 = u0, v0		C			M I
   286  	ld8		v0 = [vp], 8		C			M01
   287     (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
   288     (p6)	add		w3 = INCR, w3		C			M I
   289  	br		.LL10			C			B
   290  
   291  
        C  n mod 4 = 3.  Limb 0 goes into w0; w1, w2 follow.
   292  .Lb11:	ld8		v1 = [vp], 8		C			M01
   293  	ld8		u1 = [up], 8		C			M01
   294  	shr.u		n = n, 2		C			I0
   295  	;;
   296  	ld8		v2 = [vp], 8		C			M01
   297  	ld8		u2 = [up], 8		C			M01
   298  	ADDSUB		w0 = r10, r11		C			M I
   299    (p15)	br.dpnt		.grt3			C			B
   300  	;;
   301  
        C  p15 clear (n = 3): finish at .Lcj3.
   302  	cmp.PRED	p8, p0 = w0, r10	C			M I
   303  	ADDSUB		w1 = u1, v1		C			M I
   304  	and		r8 = 1, w0		C			M I
   305  	;;
   306  	cmp.PRED	p9, p0 = w1, u1		C			M I
   307  	;;
   308  	ADDSUB		w2 = u2, v2		C			M I
   309     (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
   310     (p8)	add		w1 = INCR, w1		C			M I
   311  	;;
   312  	cmp.PRED	p6, p0 = w2, u2		C			M I
   313  	shrp		x0 = w1, w0, 1		C			I0
   314  	;;
   315     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   316     (p9)	add		w2 = INCR, w2		C			M I
   317  	br		.Lcj3			C			B
   318  
        C  n > 4: ar.lc = (n - 4) >> 2.  The br.cloop below continues the
        C  feed-in at .grt7 when a loop pass is needed, else falls through.
   319  .grt3:	ld8		v3 = [vp], 8		C			M01
   320  	ld8		u3 = [up], 8		C			M01
   321  	;;
   322  	ld8		v0 = [vp], 8		C			M01
   323  	mov.i		ar.lc = n		C			I0
   324  	cmp.PRED	p8, p0 = w0, r10	C			M I
   325  	ld8		u0 = [up], 8		C			M01
   326  	ADDSUB		w1 = u1, v1		C			M I
   327  	and		r8 = 1, w0		C			M I
   328  	;;
   329  	ld8		v1 = [vp], 8		C			M01
   330  	cmp.PRED	p9, p0 = w1, u1		C			M I
   331  	ld8		u1 = [up], 8		C			M01
   332  	;;
   333  	ADDSUB		w2 = u2, v2		C			M I
   334  	ld8		v2 = [vp], 8		C			M01
   335     (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
   336     (p8)	add		w1 = INCR, w1		C			M I
   337  	;;
   338  	cmp.PRED	p6, p0 = w2, u2		C			M I
   339  	shrp		x0 = w1, w0, 1		C			I0
   340  	ld8		u2 = [up], 8		C			M01
   341  	ADDSUB		w3 = u3, v3		C			M I
   342  	br.cloop.dptk	.grt7			C			B
   343  	;;
   344  
        C  No loop pass (n = 7): finish via .Lcj7.
   345  	cmp.PRED	p7, p0 = w3, u3		C			M I
   346     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   347     (p9)	add		w2 = INCR, w2		C			M I
   348  	br		.Lcj7			C			B
   349  
        C  Continue the feed-in with loads, then enter the main loop at .LL11.
   350  .grt7:	ld8		v3 = [vp], 8		C			M01
   351  	cmp.PRED	p7, p0 = w3, u3		C			M I
   352  	ld8		u3 = [up], 8		C			M01
   353     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   354     (p9)	add		w2 = INCR, w2		C			M I
   355  	br		.LL11			C			B
   356  
   357  
        C  Each loop pass handles four limbs.  For every limb the steps are:
        C    wK = uK ADDSUB vK
        C    cmp.PRED sets the carry predicate of wK
        C    under the carry predicate of the limb below: cmp.eq.or folds in
        C    the carry caused by the adjustment, and wK is adjusted by INCR
        C    xK = shrp (w(K+1), wK, 1), which is then stored through rp
        C  The steps of neighbouring limbs are interleaved across the
        C  instruction groups, together with the loads for later limbs.
   358  C *** MAIN LOOP START ***
   359  	ALIGN(32)
   360  .Loop:	st8		[rp] = x3, 8		C			M23
   361  	ld8		v3 = [vp], 8		C			M01
   362  	cmp.PRED	p7, p0 = w3, u3		C			M I
   363  	ld8		u3 = [up], 8		C			M01
   364     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   365     (p9)	add		w2 = INCR, w2		C			M I
   366  	;;
   367  .LL11:	st8		[rp] = x0, 8		C			M23
   368  	shrp		x1 = w2, w1, 1		C			I0
   369  	ADDSUB		w0 = u0, v0		C			M I
   370  	ld8		v0 = [vp], 8		C			M01
   371     (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
   372     (p6)	add		w3 = INCR, w3		C			M I
   373  	;;
   374  .LL10:	cmp.PRED	p8, p0 = w0, u0		C			M I
   375  	shrp		x2 = w3, w2, 1		C			I0
   376  	nop.b		0
   377  	ld8		u0 = [up], 8		C			M01
   378  	ADDSUB		w1 = u1, v1		C			M I
   379  	nop.b		0
   380  	;;
   381  	st8		[rp] = x1, 8		C			M23
   382  	ld8		v1 = [vp], 8		C			M01
   383  	cmp.PRED	p9, p0 = w1, u1		C			M I
   384  	ld8		u1 = [up], 8		C			M01
   385     (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
   386     (p7)	add		w0 = INCR, w0		C			M I
   387  	;;
   388  .LL01:	st8		[rp] = x2, 8		C			M23
   389  	shrp		x3 = w0, w3, 1		C			I0
   390  	ADDSUB		w2 = u2, v2		C			M I
   391  	ld8		v2 = [vp], 8		C			M01
   392     (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
   393     (p8)	add		w1 = INCR, w1		C			M I
   394  	;;
   395  .LL00:	cmp.PRED	p6, p0 = w2, u2		C			M I
   396  	shrp		x0 = w1, w0, 1		C			I0
   397  	nop.b		0
   398  	ld8		u2 = [up], 8		C			M01
   399  	ADDSUB		w3 = u3, v3		C			M I
   400  	br.cloop.dptk	.Loop			C			B
   401  	;;
   402  C *** MAIN LOOP END ***
   403  
        C  Wind-down: the same steps as the loop body but without loads.
        C  .Lcj7 down to .Lcj1 are also the join points used by the short and
        C  no-loop paths above.
   404  .Lskip:	st8		[rp] = x3, 8		C			M23
   405  	cmp.PRED	p7, p0 = w3, u3		C			M I
   406     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   407     (p9)	add		w2 = INCR, w2		C			M I
   408  	;;
   409  .Lcj7:	st8		[rp] = x0, 8		C			M23
   410  	shrp		x1 = w2, w1, 1		C			I0
   411  	ADDSUB		w0 = u0, v0		C			M I
   412     (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
   413     (p6)	add		w3 = INCR, w3		C			M I
   414  	;;
   415  .Lcj6:	cmp.PRED	p8, p0 = w0, u0		C			M I
   416  	shrp		x2 = w3, w2, 1		C			I0
   417  	ADDSUB		w1 = u1, v1		C			M I
   418  	;;
   419  	st8		[rp] = x1, 8		C			M23
   420  	cmp.PRED	p9, p0 = w1, u1		C			M I
   421     (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
   422     (p7)	add		w0 = INCR, w0		C			M I
   423  	;;
   424  .Lcj5:	st8		[rp] = x2, 8		C			M23
   425  	shrp		x3 = w0, w3, 1		C			I0
   426  	ADDSUB		w2 = u2, v2		C			M I
   427     (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
   428     (p8)	add		w1 = INCR, w1		C			M I
   429  	;;
   430  .Lcj4:	cmp.PRED	p6, p0 = w2, u2		C			M I
   431  	shrp		x0 = w1, w0, 1		C			I0
   432  	;;
   433  	st8		[rp] = x3, 8		C			M23
   434     (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
   435     (p9)	add		w2 = INCR, w2		C			M I
   436  	;;
        C  w2 is the highest limb here: x2 = w2 >> 1, and bit 63 of x2 is set
        C  at .Lcj2 when p6 (the carry out of w2) is set.
   437  .Lcj3:	st8		[rp] = x0, 8		C			M23
   438  	shrp		x1 = w2, w1, 1		C			I0
   439  	shr.u		x2 = w2, 1		C			I0
   440  	;;
   441  .Lcj2:	st8		[rp] = x1, 8		C			M23
   442     (p6)	dep		x2 = -1, x2, 63, 1	C			I0
   443  	;;
        C  Store the highest result limb, restore the caller ar.lc, return.
   444  .Lcj1:	st8		[rp] = x2		C			M23
   445  	mov.i		ar.lc = r2		C			I0
   446  	br.ret.sptk.many b0			C			B
   447  EPILOGUE()