github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/aors_n.asm (about)

     1  dnl  IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C           cycles/limb
    36  C Itanium:      2.67
    37  C Itanium 2:    1.25
    38  
    39  C TODO
    40  C  * Consider using special code for small n, using something like
    41  C    "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
    42  C  * The non-nc code was trimmed cycle for cycle to its current state.  It is
    43  C    probably hard to save more than an odd cycle there.  The nc code is much
    44  C    cruder (since tune/speed doesn't have any applicable direct measurements).
    45  C  * Without the nc entry points, this becomes around 1800 bytes of object
    46  C    code; the nc code adds over 1000 bytes.  We should perhaps sacrifice a
    47  C    few cycles for the non-nc code and let it fall into the nc code.
    48  
    49  C INPUT PARAMETERS
    50  define(`rp', `r32')		C destination limb pointer (target of the st8 stores)
    51  define(`up', `r33')		C first source limb pointer
    52  define(`vp', `r34')		C second source limb pointer
    53  define(`n',  `r35')		C limb count; later reduced to the ar.lc loop count
    54  define(`cy', `r36')		C carry-in, read by the func_nc entry point only
    55  
    56  ifdef(`OPERATION_add_n',`
    57    define(ADDSUB,	add)
    58    define(CND,		ltu)
    59    define(INCR,		1)
    60    define(LIM,		-1)
    61    define(LIM2,		0)
    62    define(func,    mpn_add_n)
    63    define(func_nc, mpn_add_nc)
    64  ')		C add: carry out when sum < operand; a carry-in wraps a sum equal to LIM
    65  ifdef(`OPERATION_sub_n',`
    66    define(ADDSUB,	sub)
    67    define(CND,		gtu)
    68    define(INCR,		-1)
    69    define(LIM,		0)
    70    define(LIM2,		-1)
    71    define(func,    mpn_sub_n)
    72    define(func_nc, mpn_sub_nc)
    73  ')		C sub: borrow out when difference > operand; LIM2 is the wrapped value after INCR
    74  
    75  define(PFDIST, 500)		C byte offset added to up/vp to form the lfetch addresses
    76  
    77  C Some useful aliases for registers we use (u*, v*: source limbs; w*: result limbs)
    78  define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
    79  define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
    80  define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
    81  define(`rpx',`r3')		C second store pointer used by the main loop beside rp
    82  define(`upadv',`r20') define(`vpadv',`r21')	C prefetch addresses for the up and vp streams
    83  
    84  MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
    85  
    86  ASM_START()
    87  PROLOGUE(func_nc)
    88  	.prologue		C func_nc(rp, up, vp, n, cy): ADDSUB of two n-limb operands with carry-in cy
    89  	.save	ar.lc, r2	C ar.lc is copied to r2 below and put back at L(end)
    90  	.body			C carry/borrow out ends up as 0 or 1 in r8
    91  ifdef(`HAVE_ABI_32',`
    92  		addp4	rp = 0, rp		C			M I
    93  		addp4	up = 0, up		C			M I
    94  		nop.i	0
    95  		addp4	vp = 0, vp		C			M I
    96  		nop.m	0
    97  		zxt4	n = n			C			I
    98  	;;
    99  ')
   100  
   101   {.mmi;		ld8	r11 = [vp], 8		C			M01  vp limb 0
   102  		ld8	r10 = [up], 8		C			M01  up limb 0
   103  		mov	r2 = ar.lc		C			I0   save ar.lc
   104  }{.mmi;		and	r14 = 7, n		C			M I  r14 = n mod 8
   105  		cmp.lt	p15, p14 = 8, n		C			M I  p15: n > 8, p14: n <= 8
   106  		add	n = -6, n		C			M I  feed-in code sets ar.lc = n >> 3
   107  	;;
   108  }{.mmi;		add	upadv = PFDIST, up	C Merging these lines into the feed-in
   109  		add	vpadv = PFDIST, vp	C code could save a cycle per call at
   110  		mov	r23 = cy		C the expense of code size.
   111  	;;
   112  }{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
   113  		cmp.eq	p7, p0 = 2, r14		C			M I
   114  		cmp.eq	p8, p0 = 3, r14		C			M I
   115  }{.bbb;	(p6)	br.dptk	.Lc001			C			B    n mod 8 = 1
   116  	(p7)	br.dptk	.Lc010			C			B    n mod 8 = 2
   117  	(p8)	br.dptk	.Lc011			C			B    n mod 8 = 3
   118  	;;
   119  }{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
   120  		cmp.eq	p10, p0 = 5, r14	C			M I
   121  		cmp.eq	p11, p0 = 6, r14	C			M I
   122  }{.bbb;	(p9)	br.dptk	.Lc100			C			B    n mod 8 = 4
   123  	(p10)	br.dptk	.Lc101			C			B    n mod 8 = 5
   124  	(p11)	br.dptk	.Lc110			C			B    n mod 8 = 6
   125  	;;
   126  }{.mmi;		ld8	r19 = [vp], 8		C			M01  vp limb 1 (n mod 8 is 0 or 7 here)
   127  		ld8	r18 = [up], 8		C			M01  up limb 1
   128  		cmp.ne	p13, p0 = 0, cy		C copy cy to p13	M I
   129  }{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
   130  		nop	0
   131  	(p12)	br.dptk	.Lc111			C			B    n mod 8 = 7, else fall into .Lc000
   132  	;;
   133  }
   134  
   135  .Lc000:			C n mod 8 = 0, carry-in in p13; limbs 0 and 1 are already loaded; joins the loop at L(m0)
   136   {.mmi;		ld8	v3 = [vp], 8		C			M01
   137  		ld8	u3 = [up], 8		C			M01
   138  		shr.u	n = n, 3		C			I0   loop count for ar.lc
   139  	;;
   140  }{.mmi;		add	vpadv = PFDIST, vp	C			M I
   141  		ld8	v0 = [vp], 8		C			M01
   142  		mov	ar.lc = n		C			I0
   143  }{.mmi;		ld8	u0 = [up], 8		C			M01
   144  		ADDSUB	w1 = r10, r11		C			M I  limb 0
   145  		nop	0
   146  	;;
   147  }{.mmi;		add	upadv = PFDIST, up	C			M I
   148  		ld8	v1 = [vp], 8		C			M01
   149  		cmp.CND	p7, p0 = w1, r10	C			M I  p7 = carry out of limb 0
   150  }{.mmi;		ld8	u1 = [up], 8		C			M01
   151  		ADDSUB	w2 = r18, r19		C			M I  limb 1
   152  		add	rpx = 8, rp		C			M I  rpx addresses the limb after rp
   153  	;;
   154  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   155  		cmp.CND	p8, p0 = w2, r18	C			M I
   156  	(p13)	cmpeqor	p7, p0 = LIM, w1	C			M I  carry-in wraps w1 when w1 = LIM
   157  }{.mmi;		ld8	u2 = [up], 8		C			M01
   158  	(p13)	add	w1 = INCR, w1		C			M I  apply carry-in
   159  		ADDSUB	w3 = u3, v3		C			M I
   160  	;;
   161  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   162  		cmp.CND	p9, p0 = w3, u3		C			M I  compares with the old u3, reloaded in this group
   163  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   164  }{.mmb;		ld8	u3 = [up], 8		C			M01
   165  	(p7)	add	w2 = INCR, w2		C			M I
   166  		br	L(m0)
   167  }
   168  
   169  .Lc001:			C n mod 8 = 1: n = 1 applies the carry-in and leaves through L(cj1); n > 8 goes to L(0)
   170   {.mmi;	(p15)	ld8	v1 = [vp], 8		C			M01
   171  	(p15)	ld8	u1 = [up], 8		C			M01
   172  		ADDSUB	w0 = r10, r11		C			M I  limb 0
   173  }{.mmb;		nop	0
   174  		nop	0
   175  	(p15)	br	L(0)
   176  	;;
   177  }{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I  p9 = carry-in
   178  		mov	r8 = 0
   179  		cmp.CND	p6, p0 = w0, r10	C			M I
   180  	;;
   181  }{.mmb;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
   182  	(p9)	add	w0 = INCR, w0		C			M I
   183  		br	L(cj1)			C			B
   184  }
   185  L(0):			C n > 8: load further limbs, set ar.lc and join the loop at L(c1)
   186   {.mmi;		ld8	v2 = [vp], 8		C			M01
   187  		ld8	u2 = [up], 8		C			M01
   188  		shr.u	n = n, 3		C			I0
   189  	;;
   190  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   191  		ld8	u3 = [up], 8		C			M01
   192  		mov	ar.lc = n		C			I0
   193  }{.mmi;		nop	0
   194  		cmp.ne	p9, p0 = 0, r23		C			M I  p9 = carry-in, consumed at L(c1)
   195  		nop	0
   196  	;;
   197  }{.mmi;		ld8	v0 = [vp], 8		C			M01
   198  		cmp.CND	p6, p0 = w0, r10	C			M I
   199  		add	rpx = 16, rp		C			M I
   200  }{.mmb;		ld8	u0 = [up], 8		C			M01
   201  		ADDSUB	w1 = u1, v1		C			M I
   202  		br	L(c1)			C			B
   203  }
   204  
   205  .Lc010:			C n mod 8 = 2: n = 2 leaves through L(cj2); n > 8 goes to L(1)
   206   {.mmi;		ld8	v0 = [vp], 8		C			M01
   207  		ld8	u0 = [up], 8		C			M01
   208  		mov	r8 = 0			C			M I
   209  }{.mmb;		ADDSUB	w3 = r10, r11		C			M I  limb 0
   210  		cmp.ne	p8, p0 = 0, r23		C			M I  p8 = carry-in
   211  	(p15)	br	L(1)			C			B
   212  	;;
   213  }{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
   214  		ADDSUB	w0 = u0, v0		C			M I
   215  	(p8)	add	w3 = INCR, w3		C			M I  w3 is adjusted before the test below
   216  	;;
   217  }{.mmb;		cmp.CND	p6, p0 = w0, u0		C			M I
   218  	(p8)	cmpeqor	p9, p0 = LIM2, w3	C			M I  hence the post-adjust limit LIM2
   219  		br	L(cj2)			C			B
   220  }
   221  L(1):			C n > 8: load further limbs, set ar.lc and join the loop at L(m23)
   222   {.mmi;		ld8	v1 = [vp], 8		C			M01
   223  		ld8	u1 = [up], 8		C			M01
   224  		shr.u	n = n, 3		C			I0
   225  	;;
   226  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   227  		ld8	u2 = [up], 8		C			M01
   228  		mov	ar.lc = n		C			I0
   229  	;;
   230  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   231  		ld8	u3 = [up], 8		C			M01
   232  		cmp.CND	p9, p0 = w3, r10	C			M I
   233  	;;
   234  }{.mmi;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I  tests w3 before the add in this group
   235  	(p8)	add	w3 = INCR, w3		C			M I
   236  		ADDSUB	w0 = u0, v0		C			M I
   237  }{.mmb;		add	rpx = 24, rp		C			M I
   238  		nop	0
   239  		br	L(m23)			C			B
   240  }
   241  
   242  .Lc011:			C n mod 8 = 3: n = 3 leaves through L(cj3); n > 8 goes to L(2)
   243   {.mmi;		ld8	v3 = [vp], 8		C			M01
   244  		ld8	u3 = [up], 8		C			M01
   245  		shr.u	n = n, 3		C			I0
   246  }{.mmi;		ADDSUB	w2 = r10, r11		C			M I  limb 0
   247  		cmp.ne	p7, p0 = 0, r23		C			M I  p7 = carry-in
   248  		nop	0
   249  	;;
   250  }{.mmb;		ld8	v0 = [vp], 8		C			M01
   251  		ld8	u0 = [up], 8		C			M01
   252  	(p15)	br	L(2)			C			B
   253  }{.mmi;		cmp.CND	p8, p0 = w2, r10	C			M I
   254  		ADDSUB	w3 = u3, v3		C			M I
   255  		nop	0
   256  	;;
   257  }{.mmb;	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   258  	(p7)	add	w2 = INCR, w2		C			M I
   259  		br	L(cj3)			C			B
   260  }
   261  L(2):			C n > 8: store limb 0 here, then join the loop at L(m23)
   262   {.mmi;		ld8	v1 = [vp], 8		C			M01
   263  		ld8	u1 = [up], 8		C			M01
   264  		ADDSUB	w3 = u3, v3		C			M I
   265  	;;
   266  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   267  		ld8	u2 = [up], 8		C			M01
   268  		cmp.CND	p8, p0 = w2, r10	C			M I
   269  	;;
   270  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   271  		cmp.CND	p9, p0 = w3, u3		C			M I
   272  		mov	ar.lc = n		C			I0
   273  }{.mmi;		ld8	u3 = [up], 8		C			M01
   274  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   275  	(p7)	add	w2 = INCR, w2		C			M I
   276  	;;
   277  }{.mmi;		add	rpx = 32, rp		C			M I  uses rp as it was before the store below
   278  		st8	[rp] = w2, 8		C			M23  limb 0
   279  	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   280  }{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
   281  		ADDSUB	w0 = u0, v0		C			M I
   282  		br	L(m23)
   283  }
   284  
   285  .Lc100:			C n mod 8 = 4: n = 4 (p14) leaves through L(cj4); otherwise joins the loop at L(m4)
   286   {.mmi;		ld8	v2 = [vp], 8		C			M01
   287  		ld8	u2 = [up], 8		C			M01
   288  		shr.u	n = n, 3		C			I0
   289  }{.mmi;		ADDSUB	w1 = r10, r11		C			M I  limb 0
   290  		nop	0
   291  		nop	0
   292  	;;
   293  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   294  		ld8	u3 = [up], 8		C			M01
   295  		add	rpx = 8, rp		C			M I
   296  }{.mmi;		cmp.ne	p6, p0 = 0, r23		C			M I  p6 = carry-in
   297  		cmp.CND	p7, p0 = w1, r10	C			M I
   298  		nop	0
   299  	;;
   300  }{.mmi;		ld8	v0 = [vp], 8		C			M01
   301  		ld8	u0 = [up], 8		C			M01
   302  		ADDSUB	w2 = u2, v2		C			M I
   303  }{.mmb;	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
   304  	(p6)	add	w1 = INCR, w1		C			M I
   305  	(p14)	br	L(cj4)
   306  	;;
   307  }{.mmi;		ld8	v1 = [vp], 8		C			M01
   308  		ld8	u1 = [up], 8		C			M01
   309  		mov	ar.lc = n		C			I0   only written on the n > 8 path
   310  	;;
   311  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   312  		cmp.CND	p8, p0 = w2, u2		C			M I
   313  		nop	0
   314  }{.mmi;		ld8	u2 = [up], 8		C			M01
   315  		nop	0
   316  		ADDSUB	w3 = u3, v3		C			M I
   317  	;;
   318  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   319  		cmp.CND	p9, p0 = w3, u3		C			M I
   320  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   321  }{.mmb;		ld8	u3 = [up], 8		C			M01
   322  	(p7)	add	w2 = INCR, w2		C			M I
   323  		br	L(m4)
   324  }
   325  
   326  .Lc101:			C n mod 8 = 5: joins the loop at L(c5) when n > 8, otherwise goes to L(end)
   327   {.mmi;		ld8	v1 = [vp], 8		C			M01
   328  		ld8	u1 = [up], 8		C			M01
   329  		shr.u	n = n, 3		C			I0
   330  	;;
   331  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   332  		ld8	u2 = [up], 8		C			M01
   333  		mov	ar.lc = n		C			I0   written for n = 5 too; L(end) restores ar.lc
   334  	;;
   335  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   336  		ld8	u3 = [up], 8		C			M01
   337  		ADDSUB	w0 = r10, r11		C			M I  limb 0
   338  }{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I  p9 = carry-in, consumed at L(c5) or L(end)
   339  		add	rpx = 16, rp		C			M I
   340  		nop	0
   341  	;;
   342  }{.mmi;		ld8	v0 = [vp], 8		C			M01
   343  		ld8	u0 = [up], 8		C			M01
   344  		cmp.CND	p6, p0 = w0, r10	C			M I
   345  }{.mbb;		ADDSUB	w1 = u1, v1		C			M I
   346  	(p15)	br	L(c5)			C			B
   347  		br	L(end)			C			B
   348  }
   349  
   350  .Lc110:			C n mod 8 = 6: joins the loop at L(m67)
   351   {.mmi;		ld8	v0 = [vp], 8		C			M01
   352  		ld8	u0 = [up], 8		C			M01
   353  		shr.u	n = n, 3		C			I0
   354  	;;
   355  }{.mmi;		add	upadv = PFDIST, up	C			M I
   356  		add	vpadv = PFDIST, vp	C			M I
   357  		mov	ar.lc = n		C			I0
   358  }{.mmi;		ld8	v1 = [vp], 8		C			M01
   359  		ld8	u1 = [up], 8		C			M01
   360  		ADDSUB	w3 = r10, r11		C			M I  limb 0
   361  	;;
   362  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   363  		ld8	u2 = [up], 8		C			M01
   364  		ADDSUB	w0 = u0, v0		C			M I
   365  }{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
   366  		cmp.ne	p8, p0 = 0, r23		C			M I  p8 = carry-in
   367  		add	rpx = 24, rp		C			M I
   368  	;;
   369  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   370  		ld8	u3 = [up], 8		C			M01
   371  		nop	0
   372  }{.mmb;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   373  	(p8)	add	w3 = INCR, w3		C			M I
   374  		br	L(m67)			C			B
   375  }
   376  
   377  .Lc111:			C n mod 8 = 7, carry-in in p13; stores limb 0 here and joins the loop at L(m67)
   378   {.mmi;		ld8	v0 = [vp], 8		C			M01
   379  		ld8	u0 = [up], 8		C			M01
   380  		shr.u	n = n, 3		C			I0
   381  	;;
   382  }{.mmi;		add	upadv = PFDIST, up	C			M I
   383  		ld8	v1 = [vp], 8		C			M01
   384  		mov	ar.lc = n		C			I0
   385  }{.mmi;		ld8	u1 = [up], 8		C			M01
   386  		ADDSUB	w2 = r10, r11		C			M I  limb 0
   387  		nop	0
   388  	;;
   389  }{.mmi;		add	vpadv = PFDIST, vp	C			M I
   390  		ld8	v2 = [vp], 8		C			M01
   391  		cmp.CND	p8, p0 = w2, r10	C			M I
   392  }{.mmi;		ld8	u2 = [up], 8		C			M01
   393  		ADDSUB	w3 = r18, r19		C			M I  limb 1
   394  		nop	0
   395  	;;
   396  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   397  		cmp.CND	p9, p0 = w3, r18	C			M I
   398  	(p13)	cmpeqor	p8, p0 = LIM, w2	C			M I
   399  }{.mmi;		ld8	u3 = [up], 8		C			M01
   400  	(p13)	add	w2 = INCR, w2		C			M I  apply carry-in
   401  		nop	0
   402  	;;
   403  }{.mmi;		add	rpx = 32, rp		C			M I  uses rp as it was before the store below
   404  		st8	[rp] = w2, 8		C			M23  limb 0
   405  	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   406  }{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
   407  		ADDSUB	w0 = u0, v0		C			M I
   408  		br	L(m67)
   409  }
   410  EPILOGUE()
   411  
   412  PROLOGUE(func)
   413  	.prologue		C func(rp, up, vp, n): ADDSUB of two n-limb operands, no carry-in
   414  	.save	ar.lc, r2	C ar.lc is copied to r2 below and put back at L(end)
   415  	.body			C carry/borrow out ends up as 0 or 1 in r8
   416  ifdef(`HAVE_ABI_32',`
   417  		addp4	rp = 0, rp		C			M I
   418  		addp4	up = 0, up		C			M I
   419  		nop.i	0
   420  		addp4	vp = 0, vp		C			M I
   421  		nop.m	0
   422  		zxt4	n = n			C			I
   423  	;;
   424  ')
   425  
   426   {.mmi;		ld8	r11 = [vp], 8		C			M01  vp limb 0
   427  		ld8	r10 = [up], 8		C			M01  up limb 0
   428  		mov	r2 = ar.lc		C			I0   save ar.lc
   429  }{.mmi;		and	r14 = 7, n		C			M I  r14 = n mod 8
   430  		cmp.lt	p15, p14 = 8, n		C			M I  p15: n > 8, p14: n <= 8
   431  		add	n = -6, n		C			M I  feed-in code sets ar.lc = n >> 3
   432  	;;
   433  }{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
   434  		cmp.eq	p7, p0 = 2, r14		C			M I
   435  		cmp.eq	p8, p0 = 3, r14		C			M I
   436  }{.bbb;	(p6)	br.dptk	.Lb001			C			B    n mod 8 = 1
   437  	(p7)	br.dptk	.Lb010			C			B    n mod 8 = 2
   438  	(p8)	br.dptk	.Lb011			C			B    n mod 8 = 3
   439  	;;
   440  }{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
   441  		cmp.eq	p10, p0 = 5, r14	C			M I
   442  		cmp.eq	p11, p0 = 6, r14	C			M I
   443  }{.bbb;	(p9)	br.dptk	.Lb100			C			B    n mod 8 = 4
   444  	(p10)	br.dptk	.Lb101			C			B    n mod 8 = 5
   445  	(p11)	br.dptk	.Lb110			C			B    n mod 8 = 6
   446  	;;
   447  }{.mmi;		ld8	r19 = [vp], 8		C			M01  vp limb 1 (n mod 8 is 0 or 7 here)
   448  		ld8	r18 = [up], 8		C			M01  up limb 1
   449  		cmp.ne	p13, p0 = r0, r0	C clear "CF"		M I
   450  }{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
   451  		mov	r23 = 0			C			M I
   452  	(p12)	br.dptk	.Lb111			C			B    n mod 8 = 7, else fall into .Lb000
   453  	;;
   454  }
   455  
   456  .Lb000:			C n mod 8 = 0; limbs 0 and 1 are already loaded; joins the loop at L(m0)
   457   {.mmi;		ld8	v3 = [vp], 8		C			M01
   458  		ld8	u3 = [up], 8		C			M01
   459  		shr.u	n = n, 3		C			I0   loop count for ar.lc
   460  	;;
   461  }{.mmi;		ld8	v0 = [vp], 8		C			M01
   462  		ld8	u0 = [up], 8		C			M01
   463  		ADDSUB	w1 = r10, r11		C			M I  limb 0
   464  	;;
   465  }{.mmi;		ld8	v1 = [vp], 8		C			M01
   466  		cmp.CND	p7, p0 = w1, r10	C			M I  p7 = carry out of limb 0
   467  		mov	ar.lc = n		C			I0
   468  }{.mmi;		ld8	u1 = [up], 8		C			M01
   469  		ADDSUB	w2 = r18, r19		C			M I  limb 1
   470  		add	rpx = 8, rp		C			M I  rpx addresses the limb after rp
   471  	;;
   472  }{.mmi;		add	upadv = PFDIST, up
   473  		add	vpadv = PFDIST, vp
   474  		cmp.CND	p8, p0 = w2, r18	C			M I
   475  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   476  		ld8	u2 = [up], 8		C			M01
   477  		ADDSUB	w3 = u3, v3		C			M I
   478  	;;
   479  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   480  		cmp.CND	p9, p0 = w3, u3		C			M I  compares with the old u3, reloaded in this group
   481  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   482  }{.mmb;		ld8	u3 = [up], 8		C			M01
   483  	(p7)	add	w2 = INCR, w2		C			M I
   484  		br	L(m0)			C			B
   485  }
   486  
   487  	ALIGN(32)
   488  .Lb001:			C n mod 8 = 1: n = 1 (p14) leaves through L(cj1); otherwise joins the loop at L(m1)
   489   {.mmi;		ADDSUB	w0 = r10, r11		C			M I  limb 0
   490  	(p15)	ld8	v1 = [vp], 8		C			M01
   491  		mov	r8 = 0			C			M I
   492  	;;
   493  }{.mmb;		cmp.CND	p6, p0 = w0, r10	C			M I  p6 = carry out of limb 0
   494  	(p15)	ld8	u1 = [up], 8		C			M01
   495  	(p14)	br	L(cj1)			C			B
   496  	;;
   497  }{.mmi;		add	upadv = PFDIST, up
   498  		add	vpadv = PFDIST, vp
   499  		shr.u	n = n, 3		C			I0
   500  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   501  		ld8	u2 = [up], 8		C			M01
   502  		cmp.CND	p6, p0 = w0, r10	C			M I  repeats the compare two groups above
   503  	;;
   504  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   505  		ld8	u3 = [up], 8		C			M01
   506  		mov	ar.lc = n		C			I0
   507  	;;
   508  }{.mmi;		ld8	v0 = [vp], 8		C			M01
   509  		ld8	u0 = [up], 8		C			M01
   510  		ADDSUB	w1 = u1, v1		C			M I
   511  	;;
   512  }{.mmi;		ld8	v1 = [vp], 8		C			M01
   513  		cmp.CND	p7, p0 = w1, u1		C			M I
   514  		ADDSUB	w2 = u2, v2		C			M I
   515  }{.mmb;		ld8	u1 = [up], 8		C			M01
   516  		add	rpx = 16, rp		C			M I
   517  		br	L(m1)			C			B
   518  }
   519  
   520  	ALIGN(32)
   521  .Lb010:			C n mod 8 = 2: n = 2 leaves through L(cj2); n > 8 goes to L(gt2)
   522   {.mmi;		ld8	v0 = [vp], 8		C			M01
   523  		ld8	u0 = [up], 8		C			M01
   524  		shr.u	n = n, 3		C			I0
   525  }{.mmb;		ADDSUB	w3 = r10, r11		C			M I  limb 0
   526  		nop	0
   527  	(p15)	br	L(gt2)			C			B
   528  	;;
   529  }{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
   530  		ADDSUB	w0 = u0, v0		C			M I
   531  		mov	r8 = 0			C			M I
   532  	;;
   533  }{.mmb;		nop	0
   534  		cmp.CND	p6, p0 = w0, u0		C			M I
   535  		br	L(cj2)			C			B
   536  }
   537  L(gt2):			C n > 8: load further limbs, set ar.lc and join the loop at L(m23)
   538   {.mmi;		ld8	v1 = [vp], 8		C			M01
   539  		ld8	u1 = [up], 8		C			M01
   540  		nop	0
   541  	;;
   542  }{.mmi;		add	upadv = PFDIST, up
   543  		add	vpadv = PFDIST, vp
   544  		mov	ar.lc = n		C			I0
   545  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   546  		ld8	u2 = [up], 8		C			M01
   547  		nop	0
   548  	;;
   549  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   550  		cmp.CND	p9, p0 = w3, r10	C			M I
   551  		ADDSUB	w0 = u0, v0		C			M I
   552  }{.mmb;		ld8	u3 = [up], 8		C			M01
   553  		add	rpx = 24, rp		C			M I
   554  		br	L(m23)			C			B
   555  }
   556  
   557  	ALIGN(32)
   558  .Lb011:			C n mod 8 = 3: n = 3 leaves through L(cj3); n > 8 goes to L(3)
   559   {.mmi;		ld8	v3 = [vp], 8		C			M01
   560  		ld8	u3 = [up], 8		C			M01
   561  		ADDSUB	w2 = r10, r11		C			M I  limb 0
   562  	;;
   563  }{.mmb;		ld8	v0 = [vp], 8		C			M01
   564  		ld8	u0 = [up], 8		C			M01
   565  	(p15)	br	L(3)			C			B
   566  }{.mmb;		cmp.CND	p8, p0 = w2, r10	C			M I
   567  		ADDSUB	w3 = u3, v3		C			M I
   568  		br	L(cj3)			C			B
   569  }
   570  L(3):			C n > 8: store limb 0 here, then join the loop at L(m23)
   571   {.mmi;		ld8	v1 = [vp], 8		C			M01
   572  		ld8	u1 = [up], 8		C			M01
   573  		shr.u	n = n, 3		C			I0
   574  	;;
   575  }{.mmi;		add	upadv = PFDIST, up
   576  		add	vpadv = PFDIST, vp
   577  		ADDSUB	w3 = u3, v3		C			M I
   578  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   579  		ld8	u2 = [up], 8		C			M01
   580  		cmp.CND	p8, p0 = w2, r10	C			M I
   581  	;;
   582  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   583  		cmp.CND	p9, p0 = w3, u3		C			M I
   584  		mov	ar.lc = n		C			I0
   585  }{.mmi;		ld8	u3 = [up], 8		C			M01
   586  		nop	0
   587  		nop	0
   588  	;;
   589  }{.mmi;		add	rpx = 32, rp		C			M I  uses rp as it was before the store below
   590  		st8	[rp] = w2, 8		C			M23  limb 0
   591  	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   592  }{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
   593  		ADDSUB	w0 = u0, v0		C			M I
   594  		br	L(m23)			C			B
   595  }
   596  
   597  	ALIGN(32)
   598  .Lb100:			C n mod 8 = 4: n = 4 (p14) leaves through L(cj4); otherwise falls into L(gt4)
   599   {.mmi;		ld8	v2 = [vp], 8		C			M01
   600  		ld8	u2 = [up], 8		C			M01
   601  		shr.u	n = n, 3		C			I0
   602  	;;
   603  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   604  		ld8	u3 = [up], 8		C			M01
   605  		ADDSUB	w1 = r10, r11		C			M I  limb 0
   606  	;;
   607  }{.mmi;		ld8	v0 = [vp], 8		C			M01
   608  		ld8	u0 = [up], 8		C			M01
   609  		cmp.CND	p7, p0 = w1, r10	C			M I
   610  }{.mmb;		nop	0
   611  		ADDSUB	w2 = u2, v2		C			M I
   612  	(p14)	br	L(cj4)			C			B
   613  	;;
   614  }
   615  L(gt4):			C n > 8: set ar.lc and join the loop at L(m4)
   616   {.mmi;		add	upadv = PFDIST, up
   617  		add	vpadv = PFDIST, vp
   618  		mov	ar.lc = n		C			I0
   619  }{.mmi;		ld8	v1 = [vp], 8		C			M01
   620  		ld8	u1 = [up], 8		C			M01
   621  		nop	0
   622  	;;
   623  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   624  		cmp.CND	p8, p0 = w2, u2		C			M I
   625  		nop	0
   626  }{.mmi;		ld8	u2 = [up], 8		C			M01
   627  		ADDSUB	w3 = u3, v3		C			M I
   628  		add	rpx = 8, rp		C			M I
   629  	;;
   630  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   631  		cmp.CND	p9, p0 = w3, u3		C			M I
   632  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   633  }{.mmb;		ld8	u3 = [up], 8		C			M01
   634  	(p7)	add	w2 = INCR, w2		C			M I
   635  		br	L(m4)			C			B
   636  }
   637  
   638  	ALIGN(32)
   639  .Lb101:			C n mod 8 = 5: n = 5 (p14) leaves through L(cj5); otherwise falls into L(gt5)
   640   {.mmi;		ld8	v1 = [vp], 8		C			M01
   641  		ld8	u1 = [up], 8		C			M01
   642  		shr.u	n = n, 3		C			I0
   643  	;;
   644  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   645  		ld8	u2 = [up], 8		C			M01
   646  		ADDSUB	w0 = r10, r11		C			M I  limb 0
   647  	;;
   648  }{.mmi;		add	upadv = PFDIST, up
   649  		add	vpadv = PFDIST, vp
   650  		add	rpx = 16, rp		C			M I
   651  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   652  		ld8	u3 = [up], 8		C			M01
   653  		nop	0
   654  	;;
   655  }{.mmi;		ld8	v0 = [vp], 8		C			M01
   656  		cmp.CND	p6, p0 = w0, r10	C			M I
   657  		nop	0
   658  }{.mmb;		ld8	u0 = [up], 8		C			M01
   659  		ADDSUB	w1 = u1, v1		C			M I
   660  	(p14)	br	L(cj5)			C			B    ar.lc is still untouched on this path
   661  	;;
   662  }
   663  L(gt5):			C n > 8: set ar.lc and join the loop at L(m5)
   664   {.mmi;		ld8	v1 = [vp], 8		C			M01
   665  		cmp.CND	p7, p0 = w1, u1		C			M I
   666  		mov	ar.lc = n		C			I0
   667  }{.mmb;		ld8	u1 = [up], 8		C			M01
   668  		ADDSUB	w2 = u2, v2		C			M I
   669  		br	L(m5)			C			B
   670  }
   671  
   672  	ALIGN(32)
   673  .Lb110:			C n mod 8 = 6: joins the loop at L(m67)
   674   {.mmi;		ld8	v0 = [vp], 8		C			M01
   675  		ld8	u0 = [up], 8		C			M01
   676  		shr.u	n = n, 3		C			I0
   677  	;;
   678  }{.mmi;		ld8	v1 = [vp], 8		C			M01
   679  		ld8	u1 = [up], 8		C			M01
   680  		ADDSUB	w3 = r10, r11		C			M I  limb 0
   681  	;;
   682  }{.mmi;		add	upadv = PFDIST, up
   683  		add	vpadv = PFDIST, vp
   684  		mov	ar.lc = n		C			I0
   685  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   686  		ld8	u2 = [up], 8		C			M01
   687  		nop	0
   688  	;;
   689  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   690  		cmp.CND	p9, p0 = w3, r10	C			M I  p9 = carry out of limb 0
   691  		ADDSUB	w0 = u0, v0		C			M I
   692  }{.mmb;		ld8	u3 = [up], 8		C			M01
   693  		add	rpx = 24, rp		C			M I
   694  		br	L(m67)			C			B
   695  }
   696  
   697  	ALIGN(32)
   698  .Lb111:			C n mod 8 = 7; limbs 0 and 1 are already loaded; stores limb 0 and joins the loop at L(m67)
   699   {.mmi;		ld8	v0 = [vp], 8		C			M01
   700  		ld8	u0 = [up], 8		C			M01
   701  		shr.u	n = n, 3		C			I0
   702  	;;
   703  }{.mmi;		ld8	v1 = [vp], 8		C			M01
   704  		ld8	u1 = [up], 8		C			M01
   705  		ADDSUB	w2 = r10, r11		C			M I  limb 0
   706  	;;
   707  }{.mmi;		ld8	v2 = [vp], 8		C			M01
   708  		cmp.CND	p8, p0 = w2, r10	C			M I
   709  		mov	ar.lc = n		C			I0
   710  }{.mmi;		ld8	u2 = [up], 8		C			M01
   711  		ADDSUB	w3 = r18, r19		C			M I  limb 1
   712  		nop	0
   713  	;;
   714  }{.mmi;		add	upadv = PFDIST, up
   715  		add	vpadv = PFDIST, vp
   716  		nop	0
   717  }{.mmi;		ld8	v3 = [vp], 8		C			M01
   718  		ld8	u3 = [up], 8		C			M01
   719  		cmp.CND	p9, p0 = w3, r18	C			M I
   720  	;;
   721  }{.mmi;		add	rpx = 32, rp		C			M I  uses rp as it was before the store below
   722  		st8	[rp] = w2, 8		C			M23  limb 0
   723  	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   724  }{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
   725  		ADDSUB	w0 = u0, v0		C			M I
   726  		br	L(m67)			C			B
   727  }
   728  
   729  C *** MAIN LOOP START ***
   730  	ALIGN(32)
   731  L(top):			C one pass stores 8 limbs; the labels inside are entry points for the feed-in code
   732  L(c5):		ld8	v1 = [vp], 8		C			M01
   733  		cmp.CND	p7, p0 = w1, u1		C			M I  compares with the old u1, reloaded in this group
   734  	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I  incoming carry p9 wraps w0 when w0 = LIM
   735  		ld8	u1 = [up], 8		C			M01
   736  	(p9)	add	w0 = INCR, w0		C			M I  apply incoming carry
   737  		ADDSUB	w2 = u2, v2		C			M I
   738  	;;
   739  L(m5):		ld8	v2 = [vp], 8		C			M01
   740  		cmp.CND	p8, p0 = w2, u2		C			M I
   741  	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
   742  		ld8	u2 = [up], 8		C			M01
   743  	(p6)	add	w1 = INCR, w1		C			M I
   744  		ADDSUB	w3 = u3, v3		C			M I
   745  	;;
   746  		st8	[rp] = w0, 8		C			M23
   747  		ld8	v3 = [vp], 8		C			M01
   748  		cmp.CND	p9, p0 = w3, u3		C			M I
   749  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   750  		ld8	u3 = [up], 8		C			M01
   751  	(p7)	add	w2 = INCR, w2		C			M I
   752  	;;
   753  L(m4):		st8	[rp] = w1, 16		C			M23  rp skips the limb stored through rpx
   754  		st8	[rpx] = w2, 32		C			M23  rpx is used once per 4 limbs
   755  	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   756  		lfetch	[upadv], 64
   757  	(p8)	add	w3 = INCR, w3		C			M I
   758  		ADDSUB	w0 = u0, v0		C			M I
   759  	;;
   760  L(m23):		st8	[rp] = w3, 8		C			M23
   761  		ld8	v0 = [vp], 8		C			M01
   762  		cmp.CND	p6, p0 = w0, u0		C			M I
   763  		ld8	u0 = [up], 8		C			M01
   764  		ADDSUB	w1 = u1, v1		C			M I
   765  		nop.b	0
   766  	;;
   767  L(c1):		ld8	v1 = [vp], 8		C			M01
   768  		cmp.CND	p7, p0 = w1, u1		C			M I
   769  	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
   770  		ld8	u1 = [up], 8		C			M01
   771  	(p9)	add	w0 = INCR, w0		C			M I
   772  		ADDSUB	w2 = u2, v2		C			M I
   773  	;;
   774  L(m1):		ld8	v2 = [vp], 8		C			M01
   775  		cmp.CND	p8, p0 = w2, u2		C			M I
   776  	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
   777  		ld8	u2 = [up], 8		C			M01
   778  	(p6)	add	w1 = INCR, w1		C			M I
   779  		ADDSUB	w3 = u3, v3		C			M I
   780  	;;
   781  		st8	[rp] = w0, 8		C			M23
   782  		ld8	v3 = [vp], 8		C			M01
   783  		cmp.CND	p9, p0 = w3, u3		C			M I
   784  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   785  		ld8	u3 = [up], 8		C			M01
   786  	(p7)	add	w2 = INCR, w2		C			M I
   787  	;;
   788  L(m0):		st8	[rp] = w1, 16		C			M23
   789  		st8	[rpx] = w2, 32		C			M23
   790  	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   791  		lfetch	[vpadv], 64
   792  	(p8)	add	w3 = INCR, w3		C			M I
   793  		ADDSUB	w0 = u0, v0		C			M I
   794  	;;
   795  L(m67):		st8	[rp] = w3, 8		C			M23
   796  		ld8	v0 = [vp], 8		C			M01
   797  		cmp.CND	p6, p0 = w0, u0		C			M I
   798  		ld8	u0 = [up], 8		C			M01
   799  		ADDSUB	w1 = u1, v1		C			M I
   800  		br.cloop.dptk	L(top)		C			B    counted by ar.lc
   801  	;;
   802  C *** MAIN LOOP END ***
   803  
   804  L(end):			C loop exit: fold pending carry p9 into w0 and restore ar.lc; 5 limbs remain to be stored
   805   {.mmi;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
   806  	(p9)	add	w0 = INCR, w0		C			M I
   807  		mov	ar.lc = r2		C			I0   value saved at function entry
   808  }
   809  L(cj5):			C entry with 5 limbs left to store (w0 ready, carry in p6)
   810   {.mmi;		cmp.CND	p7, p0 = w1, u1		C			M I
   811  		ADDSUB	w2 = u2, v2		C			M I
   812  		nop	0
   813  	;;
   814  }{.mmi;		st8	[rp] = w0, 8		C			M23
   815  	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
   816  	(p6)	add	w1 = INCR, w1		C			M I
   817  }
   818  L(cj4):			C entry with 4 limbs left to store (w1 ready, carry in p7)
   819   {.mmi;		cmp.CND	p8, p0 = w2, u2		C			M I
   820  		ADDSUB	w3 = u3, v3		C			M I
   821  		nop	0
   822  	;;
   823  }{.mmi;		st8	[rp] = w1, 8		C			M23
   824  	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
   825  	(p7)	add	w2 = INCR, w2		C			M I
   826  }
   827  L(cj3):			C entry with 3 limbs left to store (w2 ready, carry in p8)
   828   {.mmi;		cmp.CND	p9, p0 = w3, u3		C			M I
   829  		ADDSUB	w0 = u0, v0		C			M I
   830  		nop	0
   831  	;;
   832  }{.mmi;		st8	[rp] = w2, 8		C			M23
   833  	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
   834  	(p8)	add	w3 = INCR, w3		C			M I
   835  }{.mmi;		cmp.CND	p6, p0 = w0, u0		C			M I
   836  		nop	0
   837  		mov	r8 = 0			C			M I  default result: no carry out
   838  	;;
   839  }
   840  L(cj2):			C entry with 2 limbs left to store (w3 ready, carry in p9, r8 already 0)
   841   {.mmi;		st8	[rp] = w3, 8		C			M23
   842  	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
   843  	(p9)	add	w0 = INCR, w0		C			M I
   844  	;;
   845  }
   846  L(cj1):			C entry with the last limb in w0, final carry in p6, r8 already 0
   847   {.mmb;		st8	[rp] = w0, 8		C			M23
   848  	(p6)	mov	r8 = 1			C			M I  carry/borrow out of the top limb
   849  		br.ret.sptk.many b0		C			B
   850  }
   851  EPILOGUE()
   852  ASM_END()