github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/addmul_2.asm (about)

     1  dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number by a 2-limb number and
     2  dnl  add the result to an (n+1)-limb number.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund.
     5  
     6  dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  C         cycles/limb
    37  C Itanium:    3.65
    38  C Itanium 2:  1.625
    39  
    40  C TODO
    41  C  * Clean up variable names, and try to decrease the number of distinct
    42  C    registers used.
    43  C  * Clean up feed-in code to not require zeroing several registers.
    44  C  * Make sure we don't depend on uninitialised predicate registers.
    45  C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
    46  C    wind-down code.
    47  C  * Ultimately rewrite.  The problem with this code is that it first uses a
    48  C    loaded u value in one xma pair, then leaves it live over several unrelated
    49  C    xma pairs, before it uses it again.  It should actually be quite possible
    50  C    to just swap some aligned xma pairs around.  But we should then schedule
    51  C    u loads further from the first use.
    52  
    53  C INPUT PARAMETERS
        C rp is read for the first two r limbs at entry and afterwards used as the
        C store pointer; up and vp are read pointers for the u limbs and for v0/v1;
        C the low two bits of n select the feed-in path and (n-2)>>2 goes to ar.lc.
    54  define(`rp',`r32')
    55  define(`up',`r33')
    56  define(`n',`r34')
    57  define(`vp',`r35')
    58  
        C Read pointer for the r limbs; set to rp + 16 at entry so that loads can run
        C ahead of the stores made through rp.
    59  define(`srp',`r3')
    60  
        C The two multiplier limbs loaded from vp.
    61  define(`v0',`f6')
    62  define(`v1',`f7')
    63  
        C s0 is the limb value stored through rp; acc0 is the running column sum.
        C Note that s0 shares r14 with the n mod 4 value computed at entry.
    64  define(`s0',`r14')
    65  define(`acc0',`r15')
    66  
        C The _0.._3 suffixes below rotate through the four stages of the unrolled
        C loop.
        C pr0_k receives fp0b_k through getfsig.
    67  define(`pr0_0',`r16') define(`pr0_1',`r17')
    68  define(`pr0_2',`r18') define(`pr0_3',`r19')
    69  
        C pr1_k receives fp1b_k through getfsig.
    70  define(`pr1_0',`r20') define(`pr1_1',`r21')
    71  define(`pr1_2',`r22') define(`pr1_3',`r23')
    72  
        C acc1_k receives fp2a_k through getfsig.
    73  define(`acc1_0',`r24') define(`acc1_1',`r25')
    74  define(`acc1_2',`r26') define(`acc1_3',`r27')
    75  
    76  dnl define(`',`r28')
    77  dnl define(`',`r29')
    78  dnl define(`',`r30')
    79  dnl define(`',`r31')
    80  
        C In the main loop fp0b_k is written by xma.l u_k, v0, r_k.
    81  define(`fp0b_0',`f8') define(`fp0b_1',`f9')
    82  define(`fp0b_2',`f10') define(`fp0b_3',`f11')
    83  
        C In the main loop fp1a_k is written by xma.hu u_k, v0, r_k.
    84  define(`fp1a_0',`f12') define(`fp1a_1',`f13')
    85  define(`fp1a_2',`f14') define(`fp1a_3',`f15')
    86  
        C In the main loop fp1b_k is written by xma.l u_k, v1, fp1a_k.  The feed-in
        C code also uses it for the xma.hu result of ux*v0 + rx.
    87  define(`fp1b_0',`f32') define(`fp1b_1',`f33')
    88  define(`fp1b_2',`f34') define(`fp1b_3',`f35')
    89  
        C In the main loop fp2a_k is written by xma.hu u_k, v1, fp1a_k.
    90  define(`fp2a_0',`f36') define(`fp2a_1',`f37')
    91  define(`fp2a_2',`f38') define(`fp2a_3',`f39')
    92  
        C r limbs loaded through srp.
    93  define(`r_0',`f40') define(`r_1',`f41')
    94  define(`r_2',`f42') define(`r_3',`f43')
    95  
        C u limbs loaded through up (after the first two, see ux/uy).
    96  define(`u_0',`f44') define(`u_1',`f45')
    97  define(`u_2',`f46') define(`u_3',`f47')
    98  
        C First two r limbs (rx, ry) and first two u limbs (ux, uy), loaded by the
        C entry code before the feed-in path is chosen.
    99  define(`rx',`f48')
   100  define(`ux',`f49')
   101  define(`ry',`f50')
   102  define(`uy',`f51')
   103  
   104  ASM_START()
   105  PROLOGUE(mpn_addmul_2s)
        C Entry point mpn_addmul_2s.  The feed-in below is the same as the one in
        C mpn_addmul_2, but the dispatch goes through L(x00)..L(x11).  Each of
        C those clears the predicate that guards the initial ux*v1 xma pair in the
        C matching L(b00)..L(b11) block and zeroes the fp2a register that pair
        C would have written, then joins the shared code in mpn_addmul_2.
   106  	.prologue
   107  	.save	ar.lc, r2
   108  	.body
   109  
        C 32-bit ABI only: addp4 on the three pointers and zxt4 on n.
   110  ifdef(`HAVE_ABI_32',`
   111   {.mmi;		addp4	rp = 0, rp		C			M I
   112  		addp4	up = 0, up		C			M I
   113  		addp4	vp = 0, vp		C			M I
   114  }{.mmi;		nop	1
   115  		nop	1
   116  		zxt4	n = n			C			I
   117  	;;
   118  }')
   119  
        C Group 1: load ux, v0, rx; save ar.lc in r2; r14 = n mod 4; n = n - 2.
        C Group 2: load uy, v1, ry (rp steps back to its start); n = n >> 2;
        C          p14/p11 flag r14 = 1 / r14 = 2.
        C Group 3: srp = rp + 16; p15 flags r14 = 3; ar.lc = n; dispatch.
   120   {.mmi;		ldf8	ux = [up], 8		C			M
   121  		ldf8	v0 = [vp], 8		C			M
   122  		mov	r2 = ar.lc		C			I0
   123  }{.mmi;		ldf8	rx = [rp], 8		C			M
   124  		and	r14 = 3, n		C			M I
   125  		add	n = -2, n		C			M I
   126  	;;
   127  }{.mmi;		ldf8	uy = [up], 8		C			M
   128  		ldf8	v1 = [vp]		C			M
   129  		shr.u	n = n, 2		C			I0
   130  }{.mmi;		ldf8	ry = [rp], -8		C			M
   131  		cmp.eq	p14, p0 = 1, r14	C			M I
   132  		cmp.eq	p11, p0 = 2, r14	C			M I
   133  	;;
   134  }{.mmi;		add	srp = 16, rp		C			M I
   135  		cmp.eq	p15, p0 = 3, r14	C			M I
   136  		mov	ar.lc = n		C			I0
   137  }{.bbb;	(p14)	br.dptk	L(x01)			C			B
   138  	(p11)	br.dptk	L(x10)			C			B
   139  	(p15)	br.dptk	L(x11)			C			B
   140  	;;
   141  }
        C Falling through means r14 = 0.  In every case below cmp.ne r0, r0 yields
        C false, so the named predicate is cleared before the jump into the shared
        C feed-in block, which tests exactly that predicate.
   142  L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
   143  		mov	fp2a_3 = f0
   144  		br	L(b00)
   145  L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
   146  		mov	fp2a_2 = f0
   147  		br	L(b01)
   148  L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
   149  		mov	fp2a_1 = f0
   150  		br	L(b10)
   151  L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
   152  		mov	fp2a_0 = f0
   153  		br	L(b11)
   154  
   155  EPILOGUE()
   156  
   157  PROLOGUE(mpn_addmul_2)
        C Entry point mpn_addmul_2.  Loads the first two u and r limbs plus v0 and
        C v1, then dispatches on n mod 4 to one of the feed-in blocks L(b00),
        C L(b01), L(b10), L(b11).  Each feed-in block either joins the 4-way
        C unrolled main loop or, when ar.lc is zero, goes to the wind-down code
        C (L(b10) finishes on its own in that case).  Result limbs are stored
        C through rp and the top limb is left in r8 before br.ret.
   158  	.prologue
   159  	.save	ar.lc, r2
   160  	.body
   161  
        C 32-bit ABI only: addp4 on the three pointers and zxt4 on n.
   162  ifdef(`HAVE_ABI_32',`
   163   {.mmi;		addp4	rp = 0, rp		C			M I
   164  		addp4	up = 0, up		C			M I
   165  		addp4	vp = 0, vp		C			M I
   166  }{.mmi;		nop	1
   167  		nop	1
   168  		zxt4	n = n			C			I
   169  	;;
   170  }')
   171  
        C Group 1: load ux, v0, rx; save ar.lc in r2; r14 = n mod 4; n = n - 2.
        C Group 2: load uy, v1, ry (rp steps back to its start); n = n >> 2;
        C          p14/p11 flag r14 = 1 / r14 = 2.
        C Group 3: srp = rp + 16; p15 flags r14 = 3 and p6 gets the complement,
        C          so p6 is true on the fall-through into L(b00); ar.lc = n.
        C The predicate that selected a feed-in block also enables the initial
        C ux*v1 xma pair inside that block.
   172   {.mmi;		ldf8	ux = [up], 8		C			M
   173  		ldf8	v0 = [vp], 8		C			M
   174  		mov	r2 = ar.lc		C			I0
   175  }{.mmi;		ldf8	rx = [rp], 8		C			M
   176  		and	r14 = 3, n		C			M I
   177  		add	n = -2, n		C			M I
   178  	;;
   179  }{.mmi;		ldf8	uy = [up], 8		C			M
   180  		ldf8	v1 = [vp]		C			M
   181  		shr.u	n = n, 2		C			I0
   182  }{.mmi;		ldf8	ry = [rp], -8		C			M
   183  		cmp.eq	p14, p0 = 1, r14	C			M I
   184  		cmp.eq	p11, p0 = 2, r14	C			M I
   185  	;;
   186  }{.mmi;		add	srp = 16, rp		C			M I
   187  		cmp.eq	p15, p6 = 3, r14	C			M I
   188  		mov	ar.lc = n		C			I0
   189  }{.bbb;	(p14)	br.dptk	L(b01)			C			B
   190  	(p11)	br.dptk	L(b10)			C			B
   191  	(p15)	br.dptk	L(b11)			C			B
   192  	;;
   193  }
        C Feed-in for n mod 4 = 0.  Zeroes acc1_2, pr1_2, pr0_3 and clears the carry
        C predicates p8 and p12 (p9 and p13 become true).  With ar.lc zero the
        C straight path ends in the wind-down at L(cj4); otherwise L(gt4) issues the
        C same products plus the extra loads and enters the loop at L(00).
   194  	ALIGN(32)
   195  L(b00):
   196   {.mmi;		ldf8	r_1 = [srp], 8
   197  		ldf8	u_1 = [up], 8
   198  		mov	acc1_2 = 0
   199  }{.mmi;		mov	pr1_2 = 0
   200  		mov	pr0_3 = 0
   201  		cmp.ne	p8, p9 = r0, r0
   202  	;;
   203  }{.mfi;		ldf8	r_2 = [srp], 8
   204  		xma.l	fp0b_3 = ux, v0, rx
   205  		cmp.ne	p12, p13 = r0, r0
   206  }{.mfb;		ldf8	u_2 = [up], 8
   207  		xma.hu	fp1b_3 = ux, v0, rx
   208  		br.cloop.dptk	L(gt4)
   209  }
   210  		xma.l	fp0b_0 = uy, v0, ry
   211  		xma.hu	fp1a_0 = uy, v0, ry
   212  	;;
   213  		getfsig	acc0 = fp0b_3
   214  	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
   215  	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
   216  	;;
   217  		xma.l	fp0b_1 = u_1, v0, r_1
   218  		xma.hu	fp1a_1 = u_1, v0, r_1
   219  	;;
   220  		getfsig	pr0_0 = fp0b_0
   221  		xma.l	fp1b_0 = uy, v1, fp1a_0
   222  		xma.hu	fp2a_0 = uy, v1, fp1a_0
   223  	;;
   224  		getfsig	pr1_3 = fp1b_3
   225  		getfsig	acc1_3 = fp2a_3
   226  		xma.l	fp0b_2 = u_2, v0, r_2
   227  		xma.hu	fp1a_2 = u_2, v0, r_2
   228  		br	L(cj4)
   229  
   230  L(gt4):		xma.l	fp0b_0 = uy, v0, ry
   231  		xma.hu	fp1a_0 = uy, v0, ry
   232  	;;
   233  		ldf8	r_3 = [srp], 8
   234  		getfsig	acc0 = fp0b_3
   235  	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
   236  		ldf8	u_3 = [up], 8
   237  	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
   238  	;;
   239  		xma.l	fp0b_1 = u_1, v0, r_1
   240  		xma.hu	fp1a_1 = u_1, v0, r_1
   241  	;;
   242  		ldf8	r_0 = [srp], 8
   243  		getfsig	pr0_0 = fp0b_0
   244  		xma.l	fp1b_0 = uy, v1, fp1a_0
   245  		xma.hu	fp2a_0 = uy, v1, fp1a_0
   246  	;;
   247  		ldf8	u_0 = [up], 8
   248  		getfsig	pr1_3 = fp1b_3
   249  		xma.l	fp0b_2 = u_2, v0, r_2
   250  	;;
   251  		getfsig	acc1_3 = fp2a_3
   252  		xma.hu	fp1a_2 = u_2, v0, r_2
   253  		br	L(00)
   254  
   255  
        C Feed-in for n mod 4 = 1.  Zeroes acc1_1, pr1_1, pr0_2 and clears the carry
        C predicates p6 and p10 (p7 and p11 become true).  With ar.lc zero the
        C straight path ends in the wind-down at L(cj5); otherwise L(gt5) enters the
        C loop at L(01).
   256  	ALIGN(32)
   257  L(b01):
   258   {.mmi;		ldf8	r_0 = [srp], 8		C M
   259  		ldf8	u_0 = [up], 8		C M
   260  		mov	acc1_1 = 0		C M I
   261  }{.mmi;		mov	pr1_1 = 0		C M I
   262  		mov	pr0_2 = 0		C M I
   263  		cmp.ne	p6, p7 = r0, r0		C M I
   264  	;;
   265  }{.mfi;		ldf8	r_1 = [srp], 8		C M
   266  		xma.l	fp0b_2 = ux, v0, rx	C F
   267  		cmp.ne	p10, p11 = r0, r0	C M I
   268  }{.mfi;		ldf8	u_1 = [up], 8		C M
   269  		xma.hu	fp1b_2 = ux, v0, rx	C F
   270  		nop	1
   271  	;;
   272  }		xma.l	fp0b_3 = uy, v0, ry	C F
   273  		xma.hu	fp1a_3 = uy, v0, ry	C F
   274  	;;
   275   {.mmf;		getfsig	acc0 = fp0b_2		C M
   276  		ldf8	r_2 = [srp], 8		C M
   277  	(p14)	xma.hu	fp2a_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
   278  }{.mfb;		ldf8	u_2 = [up], 8		C M
   279  	(p14)	xma.l	fp1b_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
   280  		br.cloop.dptk	L(gt5)
   281  }
   282  		xma.l	fp0b_0 = u_0, v0, r_0	C F
   283  		xma.hu	fp1a_0 = u_0, v0, r_0	C F
   284  	;;
   285  		getfsig	pr0_3 = fp0b_3		C M
   286  		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
   287  		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
   288  	;;
   289  		getfsig	pr1_2 = fp1b_2		C M
   290  		getfsig	acc1_2 = fp2a_2		C M
   291  		xma.l	fp0b_1 = u_1, v0, r_1	C F
   292  		xma.hu	fp1a_1 = u_1, v0, r_1	C F
   293  		br	L(cj5)
   294  
   295  L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
   296  		xma.hu	fp1a_0 = u_0, v0, r_0
   297  	;;
   298  		getfsig	pr0_3 = fp0b_3
   299  		ldf8	r_3 = [srp], 8
   300  		xma.l	fp1b_3 = uy, v1, fp1a_3
   301  		xma.hu	fp2a_3 = uy, v1, fp1a_3
   302  	;;
   303  		ldf8	u_3 = [up], 8
   304  		getfsig	pr1_2 = fp1b_2
   305  		xma.l	fp0b_1 = u_1, v0, r_1
   306  	;;
   307  		getfsig	acc1_2 = fp2a_2
   308  		xma.hu	fp1a_1 = u_1, v0, r_1
   309  		br	L(01)
   310  
   311  
        C Feed-in for n mod 4 = 2.  With ar.lc zero (that is n = 2) the complete
        C result is produced right here: the low limb is stored straight from
        C fp0b_1, the next two limbs are summed with explicit carry handling, r8
        C gets the top limb, ar.lc is restored from r2 and the routine returns.
        C Otherwise L(gt2) feeds the loop at L(top), or goes to L(end) when the
        C loop count is exhausted.
   312  	ALIGN(32)
   313  L(b10):		br.cloop.dptk	L(gt2)
   314  		xma.l	fp0b_1 = ux, v0, rx
   315  		xma.hu	fp1b_1 = ux, v0, rx
   316  	;;
   317  		xma.l	fp0b_2 = uy, v0, ry
   318  		xma.hu	fp1a_2 = uy, v0, ry
   319  	;;
   320  		stf8	[rp] = fp0b_1, 8
   321  	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
   322  	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
   323  	;;
   324  		getfsig	acc0 = fp0b_2
   325  		xma.l	fp1b_2 = uy, v1, fp1a_2
   326  		xma.hu	fp2a_2 = uy, v1, fp1a_2
   327  	;;
   328  		getfsig	pr1_1 = fp1b_1
   329  		getfsig	acc1_1 = fp2a_1
   330  		mov	ar.lc = r2
   331  		getfsig	pr1_2 = fp1b_2
   332  		getfsig	r8 = fp2a_2
   333  	;;
   334  		add	s0 = pr1_1, acc0
   335  	;;
        C p8 = carry out of the sum just formed.  r31 = -1 - acc1_1 is the
        C complement of acc1_1; comparing it against pr1_2 below predicts the
        C carry out of pr1_2 + acc1_1 (+ 1 when p8 is set) into p10.
   336  		st8	[rp] = s0, 8
   337  		cmp.ltu	p8, p9 = s0, pr1_1
   338  		sub	r31 = -1, acc1_1
   339  	;;
   340  	.pred.rel "mutex", p8, p9
   341  	(p8)	add	acc0 = pr1_2, acc1_1, 1
   342  	(p9)	add	acc0 = pr1_2, acc1_1
   343  	(p8)	cmp.leu	p10, p0 = r31, pr1_2
   344  	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
   345  	;;
   346  		st8	[rp] = acc0, 8
   347  	(p10)	add	r8 = 1, r8
   348  		br.ret.sptk.many b0
   349  
   350  
   351  L(gt2):
   352   {.mmi;		ldf8	r_3 = [srp], 8
   353  		ldf8	u_3 = [up], 8
   354  		mov	acc1_0 = 0
   355  	;;
   356  }{.mfi;		ldf8	r_0 = [srp], 8
   357  		xma.l	fp0b_1 = ux, v0, rx
   358  		mov	pr1_0 = 0
   359  }{.mfi;		ldf8	u_0 = [up], 8
   360  		xma.hu	fp1b_1 = ux, v0, rx
   361  		mov	pr0_1 = 0
   362  	;;
   363  }		xma.l	fp0b_2 = uy, v0, ry
   364  		xma.hu	fp1a_2 = uy, v0, ry
   365  	;;
   366  		getfsig	acc0 = fp0b_1
   367  		ldf8	r_1 = [srp], 8
   368  	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
   369  	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
   370  	;;
   371  		ldf8	u_1 = [up], 8
   372  		xma.l	fp0b_3 = u_3, v0, r_3
   373  		xma.hu	fp1a_3 = u_3, v0, r_3
   374  	;;
   375  		getfsig	pr0_2 = fp0b_2
   376  		ldf8	r_2 = [srp], 8
   377  		xma.l	fp1b_2 = uy, v1, fp1a_2
   378  		xma.hu	fp2a_2 = uy, v1, fp1a_2
   379  	;;
   380  		ldf8	u_2 = [up], 8
   381  		getfsig	pr1_1 = fp1b_1
   382  	;;
   383   {.mfi;		getfsig	acc1_1 = fp2a_1
   384  		xma.l	fp0b_0 = u_0, v0, r_0
   385  		cmp.ne	p8, p9 = r0, r0
   386  }{.mfb;		cmp.ne	p12, p13 = r0, r0
   387  		xma.hu	fp1a_0 = u_0, v0, r_0
   388  		br.cloop.sptk.clr	L(top)
   389  }
   390  		br.many	L(end)
   391  
   392  
        C Feed-in for n mod 4 = 3.  Zeroes pr1_3, pr0_0, acc1_3.  With ar.lc zero
        C (that is n = 3) the straight path ends in the wind-down at L(cj3);
        C otherwise L(gt3) enters the loop at L(11).
        C NOTE(review): on the straight path the operand pairing differs from
        C L(gt3): here uy is multiplied by v1 with addend r_2 and u_2 by v0 with
        C addend fp1a_1, whereas L(gt3) pairs u_2 with v0/r_2 and uy with
        C v1/fp1a_1.  Both pairings feed the same terms into each column sum.
   393  	ALIGN(32)
   394  L(b11):		ldf8	r_2 = [srp], 8
   395  		mov	pr1_3 = 0
   396  		mov	pr0_0 = 0
   397  	;;
   398  		ldf8	u_2 = [up], 8
   399  		mov	acc1_3 = 0
   400  		br.cloop.dptk	L(gt3)
   401  	;;
   402  		cmp.ne	p6, p7 = r0, r0
   403  		xma.l	fp0b_0 = ux, v0, rx
   404  		xma.hu	fp1b_0 = ux, v0, rx
   405  	;;
   406  		cmp.ne	p10, p11 = r0, r0
   407  		xma.l	fp0b_1 = uy, v0, ry
   408  		xma.hu	fp1a_1 = uy, v0, ry
   409  	;;
   410  		getfsig	acc0 = fp0b_0
   411  	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
   412  	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
   413  	;;
   414  		xma.l	fp0b_2 = uy, v1, r_2
   415  		xma.hu	fp1a_2 = uy, v1, r_2
   416  	;;
   417  		getfsig	pr0_1 = fp0b_1
   418  		xma.l	fp1b_1 = u_2, v0, fp1a_1
   419  		xma.hu	fp2a_1 = u_2, v0, fp1a_1
   420  	;;
   421  		getfsig	pr1_0 = fp1b_0
   422  		getfsig	acc1_0 = fp2a_0
   423  		br	L(cj3)
   424  
   425  L(gt3):		ldf8	r_3 = [srp], 8
   426  		xma.l	fp0b_0 = ux, v0, rx
   427  		cmp.ne	p10, p11 = r0, r0
   428  		ldf8	u_3 = [up], 8
   429  		xma.hu	fp1b_0 = ux, v0, rx
   430  		cmp.ne	p6, p7 = r0, r0
   431  	;;
   432  		xma.l	fp0b_1 = uy, v0, ry
   433  		xma.hu	fp1a_1 = uy, v0, ry
   434  	;;
   435  		getfsig	acc0 = fp0b_0
   436  		ldf8	r_0 = [srp], 8
   437  	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
   438  		ldf8	u_0 = [up], 8
   439  	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
   440  	;;
   441  		xma.l	fp0b_2 = u_2, v0, r_2
   442  		xma.hu	fp1a_2 = u_2, v0, r_2
   443  	;;
   444  		getfsig	pr0_1 = fp0b_1
   445  		ldf8	r_1 = [srp], 8
   446  		xma.l	fp1b_1 = uy, v1, fp1a_1
   447  		xma.hu	fp2a_1 = uy, v1, fp1a_1
   448  	;;
   449  		ldf8	u_1 = [up], 8
   450  		getfsig	pr1_0 = fp1b_0
   451  	;;
   452  		getfsig	acc1_0 = fp2a_0
   453  		xma.l	fp0b_3 = u_3, v0, r_3
   454  		xma.hu	fp1a_3 = u_3, v0, r_3
   455  		br	L(11)
   456  
   457  
        C The loop body is four copies of one three-group stage with the register
        C suffixes rotated; the stages start at L(top), L(01), L(00) and L(11).
        C Per stage:
        C   group 1: getfsig a v0 low product, load the next r limb, issue the v1
        C            xma pair for one limb, form s0 = pr1 + acc0 (+ carry).
        C   group 2: load the next u limb, getfsig a v1 low product, derive the
        C            carries of the acc0 and s0 sums into predicate pairs.
        C   group 3: getfsig a v1 high product, store s0 through rp, issue the v0
        C            xma pair for a later limb, form the next acc0 = pr0 + acc1
        C            (+ carry).
        C Carries travel between groups in the pairs p6/p7, p8/p9, p10/p11 and
        C p12/p13; within each pair exactly one predicate is set, hence the
        C .pred.rel mutex annotations.
   458  C *** MAIN LOOP START ***
   459  	ALIGN(32)
   460  L(top):						C 00
   461  	.pred.rel "mutex", p12, p13
   462  		getfsig	pr0_3 = fp0b_3
   463  		ldf8	r_3 = [srp], 8
   464  		xma.l	fp1b_3 = u_3, v1, fp1a_3
   465  	(p12)	add	s0 = pr1_0, acc0, 1
   466  	(p13)	add	s0 = pr1_0, acc0
   467  		xma.hu	fp2a_3 = u_3, v1, fp1a_3
   468  	;;					C 01
   469  	.pred.rel "mutex", p8, p9
   470  	.pred.rel "mutex", p12, p13
   471  		ldf8	u_3 = [up], 8
   472  		getfsig	pr1_2 = fp1b_2
   473  	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
   474  	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
   475  	(p12)	cmp.leu	p10, p11 = s0, pr1_0
   476  	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
   477  	;;					C 02
   478  	.pred.rel "mutex", p6, p7
   479  		getfsig	acc1_2 = fp2a_2
   480  		st8	[rp] = s0, 8
   481  		xma.l	fp0b_1 = u_1, v0, r_1
   482  	(p6)	add	acc0 = pr0_2, acc1_0, 1
   483  	(p7)	add	acc0 = pr0_2, acc1_0
   484  		xma.hu	fp1a_1 = u_1, v0, r_1
   485  	;;					C 03
   486  L(01):
   487  	.pred.rel "mutex", p10, p11
   488  		getfsig	pr0_0 = fp0b_0
   489  		ldf8	r_0 = [srp], 8
   490  		xma.l	fp1b_0 = u_0, v1, fp1a_0
   491  	(p10)	add	s0 = pr1_1, acc0, 1
   492  	(p11)	add	s0 = pr1_1, acc0
   493  		xma.hu	fp2a_0 = u_0, v1, fp1a_0
   494  	;;					C 04
   495  	.pred.rel "mutex", p6, p7
   496  	.pred.rel "mutex", p10, p11
   497  		ldf8	u_0 = [up], 8
   498  		getfsig	pr1_3 = fp1b_3
   499  	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
   500  	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
   501  	(p10)	cmp.leu	p12, p13 = s0, pr1_1
   502  	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
   503  	;;					C 05
   504  	.pred.rel "mutex", p8, p9
   505  		getfsig	acc1_3 = fp2a_3
   506  		st8	[rp] = s0, 8
   507  		xma.l	fp0b_2 = u_2, v0, r_2
   508  	(p8)	add	acc0 = pr0_3, acc1_1, 1
   509  	(p9)	add	acc0 = pr0_3, acc1_1
   510  		xma.hu	fp1a_2 = u_2, v0, r_2
   511  	;;					C 06
   512  L(00):
   513  	.pred.rel "mutex", p12, p13
   514  		getfsig	pr0_1 = fp0b_1
   515  		ldf8	r_1 = [srp], 8
   516  		xma.l	fp1b_1 = u_1, v1, fp1a_1
   517  	(p12)	add	s0 = pr1_2, acc0, 1
   518  	(p13)	add	s0 = pr1_2, acc0
   519  		xma.hu	fp2a_1 = u_1, v1, fp1a_1
   520  	;;					C 07
   521  	.pred.rel "mutex", p8, p9
   522  	.pred.rel "mutex", p12, p13
   523  		ldf8	u_1 = [up], 8
   524  		getfsig	pr1_0 = fp1b_0
   525  	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
   526  	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
   527  	(p12)	cmp.leu	p10, p11 = s0, pr1_2
   528  	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
   529  	;;					C 08
   530  	.pred.rel "mutex", p6, p7
   531  		getfsig	acc1_0 = fp2a_0
   532  		st8	[rp] = s0, 8
   533  		xma.l	fp0b_3 = u_3, v0, r_3
   534  	(p6)	add	acc0 = pr0_0, acc1_2, 1
   535  	(p7)	add	acc0 = pr0_0, acc1_2
   536  		xma.hu	fp1a_3 = u_3, v0, r_3
   537  	;;					C 09
   538  L(11):
   539  	.pred.rel "mutex", p10, p11
   540  		getfsig	pr0_2 = fp0b_2
   541  		ldf8	r_2 = [srp], 8
   542  		xma.l	fp1b_2 = u_2, v1, fp1a_2
   543  	(p10)	add	s0 = pr1_3, acc0, 1
   544  	(p11)	add	s0 = pr1_3, acc0
   545  		xma.hu	fp2a_2 = u_2, v1, fp1a_2
   546  	;;					C 10
   547  	.pred.rel "mutex", p6, p7
   548  	.pred.rel "mutex", p10, p11
   549  		ldf8	u_2 = [up], 8
   550  		getfsig	pr1_1 = fp1b_1
   551  	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
   552  	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
   553  	(p10)	cmp.leu	p12, p13 = s0, pr1_3
   554  	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
   555  	;;					C 11
   556  	.pred.rel "mutex", p8, p9
   557  		getfsig	acc1_1 = fp2a_1
   558  		st8	[rp] = s0, 8
   559  		xma.l	fp0b_0 = u_0, v0, r_0
   560  	(p8)	add	acc0 = pr0_1, acc1_3, 1
   561  	(p9)	add	acc0 = pr0_1, acc1_3
   562  		xma.hu	fp1a_0 = u_0, v0, r_0
        C NOTE(review): no branch to L(10) is visible in this file; the feed-in
        C for n mod 4 = 2 enters at L(top) or L(end) instead.
   563  L(10):		br.cloop.sptk.clr	L(top)	C 12
   564  	;;
   565  C *** MAIN LOOP END ***
        C Wind-down.  Same stage structure as the loop but with no further loads.
        C The feed-in paths for small n enter at L(cj5), L(cj4) and L(cj3).  From
        C L(cj4) onward the v0 xma pairs are gone as well, and after L(cj3) only
        C getfsig moves, carry resolution and stores remain.
   566  L(end):
   567  	.pred.rel "mutex", p12, p13
   568   {.mfi;		getfsig	pr0_3 = fp0b_3
   569  		xma.l	fp1b_3 = u_3, v1, fp1a_3
   570  	(p12)	add	s0 = pr1_0, acc0, 1
   571  }{.mfi;	(p13)	add	s0 = pr1_0, acc0
   572  		xma.hu	fp2a_3 = u_3, v1, fp1a_3
   573  		nop	1
   574  	;;
   575  }	.pred.rel "mutex", p8, p9
   576  	.pred.rel "mutex", p12, p13
   577   {.mmi;		getfsig	pr1_2 = fp1b_2
   578  		st8	[rp] = s0, 8
   579  	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
   580  }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
   581  	(p12)	cmp.leu	p10, p11 = s0, pr1_0
   582  	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
   583  	;;
   584  }	.pred.rel "mutex", p6, p7
   585   {.mfi;		getfsig	acc1_2 = fp2a_2
   586  		xma.l	fp0b_1 = u_1, v0, r_1
   587  		nop	1
   588  }{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
   589  	(p7)	add	acc0 = pr0_2, acc1_0
   590  		xma.hu	fp1a_1 = u_1, v0, r_1
   591  	;;
   592  }
   593  L(cj5):
   594  	.pred.rel "mutex", p10, p11
   595   {.mfi;		getfsig	pr0_0 = fp0b_0
   596  		xma.l	fp1b_0 = u_0, v1, fp1a_0
   597  	(p10)	add	s0 = pr1_1, acc0, 1
   598  }{.mfi;	(p11)	add	s0 = pr1_1, acc0
   599  		xma.hu	fp2a_0 = u_0, v1, fp1a_0
   600  		nop	1
   601  	;;
   602  }	.pred.rel "mutex", p6, p7
   603  	.pred.rel "mutex", p10, p11
   604   {.mmi;		getfsig	pr1_3 = fp1b_3
   605  	st8	[rp] = s0, 8
   606  	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
   607  }{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
   608  	(p10)	cmp.leu	p12, p13 = s0, pr1_1
   609  	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
   610  	;;
   611  }	.pred.rel "mutex", p8, p9
   612   {.mfi;		getfsig	acc1_3 = fp2a_3
   613  		xma.l	fp0b_2 = u_2, v0, r_2
   614  		nop	1
   615  }{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
   616  	(p9)	add	acc0 = pr0_3, acc1_1
   617  		xma.hu	fp1a_2 = u_2, v0, r_2
   618  	;;
   619  }
   620  L(cj4):
   621  	.pred.rel "mutex", p12, p13
   622   {.mfi;		getfsig	pr0_1 = fp0b_1
   623  		xma.l	fp1b_1 = u_1, v1, fp1a_1
   624  	(p12)	add	s0 = pr1_2, acc0, 1
   625  }{.mfi;	(p13)	add	s0 = pr1_2, acc0
   626  		xma.hu	fp2a_1 = u_1, v1, fp1a_1
   627  		nop	1
   628  	;;
   629  }	.pred.rel "mutex", p8, p9
   630  	.pred.rel "mutex", p12, p13
   631   {.mmi;		getfsig	pr1_0 = fp1b_0
   632  		st8	[rp] = s0, 8
   633  	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
   634  }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
   635  	(p12)	cmp.leu	p10, p11 = s0, pr1_2
   636  	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
   637  	;;
   638  }	.pred.rel "mutex", p6, p7
   639   {.mmi;		getfsig	acc1_0 = fp2a_0
   640  	(p6)	add	acc0 = pr0_0, acc1_2, 1
   641  	(p7)	add	acc0 = pr0_0, acc1_2
   642  	;;
   643  }
   644  L(cj3):
   645  	.pred.rel "mutex", p10, p11
   646   {.mfi;		getfsig	pr0_2 = fp0b_2
   647  		xma.l	fp1b_2 = u_2, v1, fp1a_2
   648  	(p10)	add	s0 = pr1_3, acc0, 1
   649  }{.mfi;	(p11)	add	s0 = pr1_3, acc0
   650  		xma.hu	fp2a_2 = u_2, v1, fp1a_2
   651  		nop	1
   652  	;;
   653  }	.pred.rel "mutex", p6, p7
   654  	.pred.rel "mutex", p10, p11
   655   {.mmi;		getfsig	pr1_1 = fp1b_1
   656  		st8	[rp] = s0, 8
   657  	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
   658  }{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
   659  	(p10)	cmp.leu	p12, p13 = s0, pr1_3
   660  	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
   661  	;;
   662  }	.pred.rel "mutex", p8, p9
   663   {.mmi;		getfsig	acc1_1 = fp2a_1
   664  	(p8)	add	acc0 = pr0_1, acc1_3, 1
   665  	(p9)	add	acc0 = pr0_1, acc1_3
   666  	;;
   667  }	.pred.rel "mutex", p12, p13
   668   {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
   669  	(p13)	add	s0 = pr1_0, acc0
   670  		nop	1
   671  	;;
   672  }	.pred.rel "mutex", p8, p9
   673  	.pred.rel "mutex", p12, p13
   674   {.mmi;		getfsig	pr1_2 = fp1b_2
   675  		st8	[rp] = s0, 8
   676  	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
   677  }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
   678  	(p12)	cmp.leu	p10, p11 = s0, pr1_0
   679  	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
   680  	;;
        C r8 starts as the high half of the last u*v1 product; the remaining
        C groups resolve the final two stored limbs and the carry into r8.
   681  }	.pred.rel "mutex", p6, p7
   682   {.mmi;		getfsig	r8 = fp2a_2
   683  	(p6)	add	acc0 = pr0_2, acc1_0, 1
   684  	(p7)	add	acc0 = pr0_2, acc1_0
   685  	;;
   686  }	.pred.rel "mutex", p10, p11
   687   {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
   688  	(p11)	add	s0 = pr1_1, acc0
   689  	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
   690  	;;
   691  }	.pred.rel "mutex", p10, p11
   692   {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
   693  	(p10)	cmp.leu	p12, p13 = s0, pr1_1
   694  	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
   695  	;;
   696  }	.pred.rel "mutex", p8, p9
   697   {.mmi;		st8	[rp] = s0, 8
   698  	(p8)	add	acc0 = pr1_2, acc1_1, 1
   699  	(p9)	add	acc0 = pr1_2, acc1_1
   700  	;;
        C p10 = carry out of the pr1_2 + acc1_1 sum (the compares read acc0 before
        C the increment in the same group).  The (p12) add folds in the carry still
        C pending from the previous stored limb; the cmpeqor that follows also sets
        C p10 when that increment wrapped acc0 to zero.
        C NOTE(review): cmpeqor is not defined in this file; presumably an or-form
        C cmp.eq supplied by the included m4 definitions -- confirm.
   701  }	.pred.rel "mutex", p8, p9
   702   {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
   703  	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
   704  	(p12)	add	acc0 = 1, acc0
   705  	;;
   706  }{.mmi;		st8	[rp] = acc0, 8
   707  	(p12)	cmpeqor	p10, p0 = 0, acc0
   708  		nop	1
   709  	;;
        C Apply the final carry to r8, restore ar.lc from r2 and return.
   710  }{.mib;	(p10)	add	r8 = 1, r8
   711  		mov	ar.lc = r2
   712  		br.ret.sptk.many b0
   713  }
   714  EPILOGUE()
   715  ASM_END()