github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/mul_2.asm (about)

     1  dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number, store
     2  dnl  the low n+1 limbs of the product, and return the most significant limb.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund.
     5  
     6  dnl  Copyright 2004, 2011 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  C         cycles/limb
    37  C Itanium:    ?
    38  C Itanium 2:  1.5
    39  
    40  C TODO
    41  C  * Clean up variable names, and try to decrease the number of distinct
    42  C    registers used.
    43  C  * Clean up feed-in code to not require zeroing several registers.
    44  C  * Make sure we don't depend on uninitialized predicate registers.
    45  C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
    46  C    wind-down code.
    47  C  * Ultimately rewrite.  The problem with this code is that it first uses a
    48  C    loaded u value in one xma pair, then leaves it live over several unrelated
    49  C    xma pairs, before it uses it again.  It should actually be quite possible
    50  C    to just swap some aligned xma pairs around.  But we should then schedule
    51  C    u loads further from the first use.
    52  
    53  C INPUT PARAMETERS
    54  define(`rp',`r32')
    55  define(`up',`r33')
    56  define(`n',`r34')
    57  define(`vp',`r35')
    58  
    59  define(`srp',`r3')
    60  
    61  define(`v0',`f6')
    62  define(`v1',`f7')
    63  
    64  define(`s0',`r14')
    65  define(`acc0',`r15')
    66  
    67  define(`pr0_0',`r16') define(`pr0_1',`r17')
    68  define(`pr0_2',`r18') define(`pr0_3',`r19')
    69  
    70  define(`pr1_0',`r20') define(`pr1_1',`r21')
    71  define(`pr1_2',`r22') define(`pr1_3',`r23')
    72  
    73  define(`acc1_0',`r24') define(`acc1_1',`r25')
    74  define(`acc1_2',`r26') define(`acc1_3',`r27')
    75  
    76  dnl define(`',`r28')
    77  dnl define(`',`r29')
    78  dnl define(`',`r30')
    79  dnl define(`',`r31')
    80  
    81  define(`fp0b_0',`f8') define(`fp0b_1',`f9')
    82  define(`fp0b_2',`f10') define(`fp0b_3',`f11')
    83  
    84  define(`fp1a_0',`f12') define(`fp1a_1',`f13')
    85  define(`fp1a_2',`f14') define(`fp1a_3',`f15')
    86  
    87  define(`fp1b_0',`f32') define(`fp1b_1',`f33')
    88  define(`fp1b_2',`f34') define(`fp1b_3',`f35')
    89  
    90  define(`fp2a_0',`f36') define(`fp2a_1',`f37')
    91  define(`fp2a_2',`f38') define(`fp2a_3',`f39')
    92  
    93  define(`u_0',`f44') define(`u_1',`f45')
    94  define(`u_2',`f46') define(`u_3',`f47')
    95  
    96  define(`ux',`f49')
    97  define(`uy',`f51')
    98  
    99  ASM_START()
   100  PROLOGUE(mpn_mul_2)
        C Entry point. Arguments are rp, up, n and vp (r32 to r35). The limbs are
        C written through rp and the final high limb is left in r8 before br.ret.
        C ar.lc is saved in r2 here and restored before each return.
   101  	.prologue
   102  	.save	ar.lc, r2
   103  	.body
   104  
        C 32-bit ABI only: pass the three pointer arguments through addp4 and
        C zero-extend n from 32 bits with zxt4.
   105  ifdef(`HAVE_ABI_32',`
   106   {.mmi;		addp4	rp = 0, rp		C			M I
   107  		addp4	up = 0, up		C			M I
   108  		addp4	vp = 0, vp		C			M I
   109  }{.mmi;		nop	1
   110  		nop	1
   111  		zxt4	n = n			C			I
   112  	;;
   113  }')
   114  
        C Load the first two u limbs into ux and uy and both v limbs into v0 and v1.
        C r14 = n mod 4 selects the feed-in block: 1 -> L(b01), 2 -> L(b10),
        C 3 -> L(b11), and 0 falls through to L(b00). r14 is the same register as
        C s0, used here as a temporary.
        C The and that forms r14 is in the same instruction group as the add that
        C modifies n, so it reads n before the add takes effect.
        C n then becomes (n - 2) >> 2 (unsigned shift) and is copied to ar.lc, the
        C counter consumed by the br.cloop instructions further down.
   115   {.mmi;		ldf8	ux = [up], 8		C			M
   116  		ldf8	v0 = [vp], 8		C			M
   117  		mov	r2 = ar.lc		C			I0
   118  }{.mmi;		nop	1			C			M
   119  		and	r14 = 3, n		C			M I
   120  		add	n = -2, n		C			M I
   121  	;;
   122  }{.mmi;		ldf8	uy = [up], 8		C			M
   123  		ldf8	v1 = [vp]		C			M
   124  		shr.u	n = n, 2		C			I0
   125  }{.mmi;		nop	1			C			M
   126  		cmp.eq	p10, p0 = 1, r14	C			M I
   127  		cmp.eq	p11, p0 = 2, r14	C			M I
   128  	;;
   129  }{.mmi;		nop	1			C			M
   130  		cmp.eq	p12, p0 = 3, r14	C			M I
   131  		mov	ar.lc = n		C			I0
   132  }{.bbb;	(p10)	br.dptk	L(b01)			C			B
   133  	(p11)	br.dptk	L(b10)			C			B
   134  	(p12)	br.dptk	L(b11)			C			B
   135  	;;
   136  }
   137  	ALIGN(32)
        C Feed-in for n mod 4 = 0, reached by falling through the dispatch branches.
        C Zeroes acc1_2, pr1_2 and pr0_3 so the first additions that read them add
        C nothing. cmp.ne r0, r0 is always false, so p8 and p12 are cleared and
        C p9 and p13 are set, meaning no incoming carry.
        C If ar.lc is zero the br.cloop falls through and the code continues at
        C L(cj4) in the wind-down; otherwise L(gt4) issues further u loads and
        C enters the main loop at L(00).
   138  L(b00):		ldf8	u_1 = [up], 8
   139  		mov	acc1_2 = 0
   140  		mov	pr1_2 = 0
   141  		mov	pr0_3 = 0
   142  		cmp.ne	p8, p9 = r0, r0
   143  	;;
   144  		xma.l	fp0b_3 = ux, v0, f0
   145  		cmp.ne	p12, p13 = r0, r0
   146  		ldf8	u_2 = [up], 8
   147  		xma.hu	fp1a_3 = ux, v0, f0
   148  		br.cloop.dptk	L(gt4)
   149  
        C Short case: no more u limbs are loaded on this path.
   150  		xma.l	fp0b_0 = uy, v0, f0
   151  		xma.hu	fp1a_0 = uy, v0, f0
   152  	;;
        C acc0 starts as the low half of ux * v0.
   153  		getfsig	acc0 = fp0b_3
   154  		xma.l	fp1b_3 = ux, v1, fp1a_3
   155  		xma.hu	fp2a_3 = ux, v1, fp1a_3
   156  	;;
   157  		xma.l	fp0b_1 = u_1, v0, f0
   158  		xma.hu	fp1a_1 = u_1, v0, f0
   159  	;;
   160  		getfsig	pr0_0 = fp0b_0
   161  		xma.l	fp1b_0 = uy, v1, fp1a_0
   162  		xma.hu	fp2a_0 = uy, v1, fp1a_0
   163  	;;
   164  		getfsig	pr1_3 = fp1b_3
   165  		getfsig	acc1_3 = fp2a_3
   166  		xma.l	fp0b_2 = u_2, v0, f0
   167  		xma.hu	fp1a_2 = u_2, v0, f0
   168  		br	L(cj4)
   169  
        C Long case: the same products as above, plus loads of u_3 and u_0 for use
        C by the main loop.
   170  L(gt4):		xma.l	fp0b_0 = uy, v0, f0
   171  		xma.hu	fp1a_0 = uy, v0, f0
   172  	;;
   173  		getfsig	acc0 = fp0b_3
   174  		xma.l	fp1b_3 = ux, v1, fp1a_3
   175  		ldf8	u_3 = [up], 8
   176  		xma.hu	fp2a_3 = ux, v1, fp1a_3
   177  	;;
   178  		xma.l	fp0b_1 = u_1, v0, f0
   179  		xma.hu	fp1a_1 = u_1, v0, f0
   180  	;;
   181  		getfsig	pr0_0 = fp0b_0
   182  		xma.l	fp1b_0 = uy, v1, fp1a_0
   183  		xma.hu	fp2a_0 = uy, v1, fp1a_0
   184  	;;
   185  		ldf8	u_0 = [up], 8
   186  		getfsig	pr1_3 = fp1b_3
   187  		xma.l	fp0b_2 = u_2, v0, f0
   188  	;;
   189  		getfsig	acc1_3 = fp2a_3
   190  		xma.hu	fp1a_2 = u_2, v0, f0
   191  		br	L(00)
   192  
   193  
   194  	ALIGN(32)
        C Feed-in for n mod 4 = 1. Zeroes acc1_1, pr1_1 and pr0_2, and clears p6 and
        C p10 (setting p7 and p11) so the first additions see no incoming carry.
        C If ar.lc is zero the br.cloop falls through and the code continues at
        C L(cj5) in the wind-down; otherwise L(gt5) loads u_3 and enters the main
        C loop at L(01).
   195  L(b01):		ldf8	u_0 = [up], 8		C M
   196  		mov	acc1_1 = 0		C M I
   197  		mov	pr1_1 = 0		C M I
   198  		mov	pr0_2 = 0		C M I
   199  		cmp.ne	p6, p7 = r0, r0		C M I
   200  	;;
   201  		xma.l	fp0b_2 = ux, v0, f0	C F
   202  		cmp.ne	p10, p11 = r0, r0	C M I
   203  		ldf8	u_1 = [up], 8		C M
   204  		xma.hu	fp1a_2 = ux, v0, f0	C F
   205  	;;
   206  		xma.l	fp0b_3 = uy, v0, f0	C F
   207  		xma.hu	fp1a_3 = uy, v0, f0	C F
   208  	;;
        C acc0 starts as the low half of ux * v0.
   209  		getfsig	acc0 = fp0b_2		C M
   210  		xma.l	fp1b_2 = ux, v1,fp1a_2	C F
   211  		ldf8	u_2 = [up], 8		C M
   212  		xma.hu	fp2a_2 = ux, v1,fp1a_2	C F
   213  		br.cloop.dptk	L(gt5)
   214  
        C Short case: no more u limbs are loaded on this path.
   215  		xma.l	fp0b_0 = u_0, v0, f0	C F
   216  		xma.hu	fp1a_0 = u_0, v0, f0	C F
   217  	;;
   218  		getfsig	pr0_3 = fp0b_3		C M
   219  		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
   220  		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
   221  	;;
   222  		getfsig	pr1_2 = fp1b_2		C M
   223  		getfsig	acc1_2 = fp2a_2		C M
   224  		xma.l	fp0b_1 = u_1, v0, f0	C F
   225  		xma.hu	fp1a_1 = u_1, v0, f0	C F
   226  		br	L(cj5)
   227  
        C Long case: the same products as above, plus a load of u_3 for use by the
        C main loop.
   228  L(gt5):		xma.l	fp0b_0 = u_0, v0, f0
   229  		xma.hu	fp1a_0 = u_0, v0, f0
   230  	;;
   231  		getfsig	pr0_3 = fp0b_3
   232  		xma.l	fp1b_3 = uy, v1, fp1a_3
   233  		xma.hu	fp2a_3 = uy, v1, fp1a_3
   234  	;;
   235  		ldf8	u_3 = [up], 8
   236  		getfsig	pr1_2 = fp1b_2
   237  		xma.l	fp0b_1 = u_1, v0, f0
   238  	;;
   239  		getfsig	acc1_2 = fp2a_2
   240  		xma.hu	fp1a_1 = u_1, v0, f0
   241  		br	L(01)
   242  
   243  
   244  	ALIGN(32)
        C Feed-in for n mod 4 = 2. The br.cloop is taken when ar.lc is nonzero and
        C goes to L(gt2). With ar.lc = (n - 2) >> 2 and n mod 4 = 2, falling
        C through means n = 2; that case is completed inline below without using
        C the main loop or the wind-down code.
   245  L(b10):		br.cloop.dptk	L(gt2)
   246  		xma.l	fp0b_1 = ux, v0, f0
   247  		xma.hu	fp1a_1 = ux, v0, f0
   248  	;;
   249  		xma.l	fp0b_2 = uy, v0, f0
   250  		xma.hu	fp1a_2 = uy, v0, f0
   251  	;;
        C The lowest result limb is the low half of ux * v0; it is stored straight
        C from the FP register.
   252  		stf8	[rp] = fp0b_1, 8
   253  		xma.l	fp1b_1 = ux, v1, fp1a_1
   254  		xma.hu	fp2a_1 = ux, v1, fp1a_1
   255  	;;
   256  		getfsig	acc0 = fp0b_2
   257  		xma.l	fp1b_2 = uy, v1, fp1a_2
   258  		xma.hu	fp2a_2 = uy, v1, fp1a_2
   259  	;;
   260  		getfsig	pr1_1 = fp1b_1
   261  		getfsig	acc1_1 = fp2a_1
        C Restore the caller value of ar.lc saved in r2 at entry.
   262  		mov	ar.lc = r2
   263  		getfsig	pr1_2 = fp1b_2
        C r8 starts as the high half of uy * v1 + fp1a_2; a carry may be added below.
   264  		getfsig	r8 = fp2a_2
   265  	;;
        C Second result limb: s0 = pr1_1 + acc0.
   266  		add	s0 = pr1_1, acc0
   267  	;;
   268  		st8	[rp] = s0, 8
        C p8 = carry out of the addition above (the sum wrapped below pr1_1).
   269  		cmp.ltu	p8, p9 = s0, pr1_1
        C r31 = -1 - acc1_1, the bitwise complement of acc1_1. Comparing it with
        C pr1_2 below tells whether pr1_2 + acc1_1 (+ 1 when p8) overflows 64 bits.
   270  		sub	r31 = -1, acc1_1
   271  	;;
   272  	.pred.rel "mutex", p8, p9
        C Third result limb: acc0 = pr1_2 + acc1_1 + carry; p10 = carry out.
   273  	(p8)	add	acc0 = pr1_2, acc1_1, 1
   274  	(p9)	add	acc0 = pr1_2, acc1_1
   275  	(p8)	cmp.leu	p10, p0 = r31, pr1_2
   276  	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
   277  	;;
   278  		st8	[rp] = acc0, 8
   279  	(p10)	add	r8 = 1, r8
   280  		br.ret.sptk.many b0
   281  
        C Long case for n mod 4 = 2. Zeroes acc1_0, pr1_0 and pr0_1, loads four more
        C u limbs, clears p8 and p12 (setting p9 and p13) and enters the main loop
        C at L(10).
   282  L(gt2):		ldf8	u_3 = [up], 8
   283  		mov	acc1_0 = 0
   284  		mov	pr1_0 = 0
   285  	;;
   286  		mov	pr0_1 = 0
   287  		xma.l	fp0b_1 = ux, v0, f0
   288  		ldf8	u_0 = [up], 8
   289  		xma.hu	fp1a_1 = ux, v0, f0
   290  	;;
   291  		xma.l	fp0b_2 = uy, v0, f0
   292  		xma.hu	fp1a_2 = uy, v0, f0
   293  	;;
        C acc0 starts as the low half of ux * v0.
   294  		getfsig	acc0 = fp0b_1
   295  		xma.l	fp1b_1 = ux, v1, fp1a_1
   296  		xma.hu	fp2a_1 = ux, v1, fp1a_1
   297  	;;
   298  		ldf8	u_1 = [up], 8
   299  		xma.l	fp0b_3 = u_3, v0, f0
   300  		xma.hu	fp1a_3 = u_3, v0, f0
   301  	;;
   302  		getfsig	pr0_2 = fp0b_2
   303  		xma.l	fp1b_2 = uy, v1, fp1a_2
   304  		xma.hu	fp2a_2 = uy, v1, fp1a_2
   305  	;;
   306  		ldf8	u_2 = [up], 8
   307  		getfsig	pr1_1 = fp1b_1
   308  	;;
   309   {.mfi;		getfsig	acc1_1 = fp2a_1
   310  		xma.l	fp0b_0 = u_0, v0, f0
   311  		cmp.ne	p8, p9 = r0, r0
   312  }{.mfb;		cmp.ne	p12, p13 = r0, r0
   313  		xma.hu	fp1a_0 = u_0, v0, f0
   314  		br	L(10)
   315  }
   316  
   317  	ALIGN(32)
        C Feed-in for n mod 4 = 3. Zeroes acc1_3, pr1_3 and pr0_0, and clears p6 and
        C p10 (setting p7 and p11) so the first additions see no incoming carry.
        C If ar.lc is zero the br.cloop falls through and the code continues at
        C L(cj3) in the wind-down; otherwise L(gt3) loads three more u limbs and
        C enters the main loop at L(11).
   318  L(b11):		mov	acc1_3 = 0
   319  		mov	pr1_3 = 0
   320  		mov	pr0_0 = 0
   321  		ldf8	u_2 = [up], 8
   322  		cmp.ne	p6, p7 = r0, r0
   323  		br.cloop.dptk	L(gt3)
   324  	;;
        C Short case: no more u limbs are loaded on this path.
   325  		xma.l	fp0b_0 = ux, v0, f0
   326  		xma.hu	fp1a_0 = ux, v0, f0
   327  	;;
   328  		cmp.ne	p10, p11 = r0, r0
   329  		xma.l	fp0b_1 = uy, v0, f0
   330  		xma.hu	fp1a_1 = uy, v0, f0
   331  	;;
        C acc0 starts as the low half of ux * v0.
   332  		getfsig	acc0 = fp0b_0
   333  		xma.l	fp1b_0 = ux, v1, fp1a_0
   334  		xma.hu	fp2a_0 = ux, v1, fp1a_0
   335  	;;
   336  		xma.l	fp0b_2 = u_2, v0, f0
   337  		xma.hu	fp1a_2 = u_2, v0, f0
   338  	;;
   339  		getfsig	pr0_1 = fp0b_1
   340  		xma.l	fp1b_1 = uy, v1, fp1a_1
   341  		xma.hu	fp2a_1 = uy, v1, fp1a_1
   342  	;;
   343  		getfsig	pr1_0 = fp1b_0
   344  		getfsig	acc1_0 = fp2a_0
   345  		br	L(cj3)
   346  
        C Long case: the same products as above, plus loads of u_3, u_0 and u_1 and
        C the u_3 * v0 products needed on entry to the main loop.
   347  L(gt3):		xma.l	fp0b_0 = ux, v0, f0
   348  		cmp.ne	p10, p11 = r0, r0
   349  		ldf8	u_3 = [up], 8
   350  		xma.hu	fp1a_0 = ux, v0, f0
   351  	;;
   352  		xma.l	fp0b_1 = uy, v0, f0
   353  		xma.hu	fp1a_1 = uy, v0, f0
   354  	;;
   355  		getfsig	acc0 = fp0b_0
   356  		xma.l	fp1b_0 = ux, v1, fp1a_0
   357  		ldf8	u_0 = [up], 8
   358  		xma.hu	fp2a_0 = ux, v1, fp1a_0
   359  	;;
   360  		xma.l	fp0b_2 = u_2, v0, f0
   361  		xma.hu	fp1a_2 = u_2, v0, f0
   362  	;;
   363  		getfsig	pr0_1 = fp0b_1
   364  		xma.l	fp1b_1 = uy, v1, fp1a_1
   365  		xma.hu	fp2a_1 = uy, v1, fp1a_1
   366  	;;
   367  		ldf8	u_1 = [up], 8
   368  		getfsig	pr1_0 = fp1b_0
   369  	;;
   370  		getfsig	acc1_0 = fp2a_0
   371  		xma.l	fp0b_3 = u_3, v0, f0
   372  		xma.hu	fp1a_3 = u_3, v0, f0
   373  		br	L(11)
   374  
   375  
   376  C *** MAIN LOOP START ***
        C One pass has twelve instruction groups, numbered 00 to 11 in the comments
        C on the ;; lines, arranged as four phases of three groups. Each pass loads
        C four u limbs and stores four result limbs. A phase does the following:
        C   group 1: load the next u limb, fetch a pr1_k value, and turn the two
        C            previous additions into carry predicates. p6/p7 and p8/p9
        C            alternate for the acc0 chain; p10/p11 and p12/p13 alternate
        C            for the s0 chain.
        C   group 2: store s0, fetch an acc1_k value, start the u * v0 products and
        C            form acc0 = pr0_k + acc1_k + carry.
        C   group 3: fetch a pr0_k value, start the u * v1 products and form
        C            s0 = pr1_k + acc0 + carry.
        C Carry detection: if the addition included a carry-in of 1 it overflowed
        C exactly when the sum is <= the addend (cmp.leu); without a carry-in it
        C overflowed exactly when the sum is < the addend (cmp.ltu).
        C L(01), L(00), L(11) and L(10) are the entry points used by the feed-in
        C code at L(gt5), L(gt4), L(gt3) and L(gt2) respectively.
   377  	ALIGN(32)
   378  L(top):						C 00
   379  	.pred.rel "mutex", p8, p9
   380  	.pred.rel "mutex", p12, p13
   381  		ldf8	u_3 = [up], 8
   382  		getfsig	pr1_2 = fp1b_2
   383  	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
   384  	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
   385  	(p12)	cmp.leu	p10, p11 = s0, pr1_0
   386  	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
   387  	;;					C 01
   388  	.pred.rel "mutex", p6, p7
   389  		getfsig	acc1_2 = fp2a_2
   390  		st8	[rp] = s0, 8
   391  		xma.l	fp0b_1 = u_1, v0, f0
   392  	(p6)	add	acc0 = pr0_2, acc1_0, 1
   393  	(p7)	add	acc0 = pr0_2, acc1_0
   394  		xma.hu	fp1a_1 = u_1, v0, f0
   395  	;;					C 02
        C Entry point from L(gt5), the n mod 4 = 1 feed-in.
   396  L(01):
   397  	.pred.rel "mutex", p10, p11
   398  		getfsig	pr0_0 = fp0b_0
   399  		xma.l	fp1b_0 = u_0, v1, fp1a_0
   400  	(p10)	add	s0 = pr1_1, acc0, 1
   401  	(p11)	add	s0 = pr1_1, acc0
   402  		xma.hu	fp2a_0 = u_0, v1, fp1a_0
   403  		nop	1
   404  	;;					C 03
   405  	.pred.rel "mutex", p6, p7
   406  	.pred.rel "mutex", p10, p11
   407  		ldf8	u_0 = [up], 8
   408  		getfsig	pr1_3 = fp1b_3
   409  	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
   410  	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
   411  	(p10)	cmp.leu	p12, p13 = s0, pr1_1
   412  	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
   413  	;;					C 04
   414  	.pred.rel "mutex", p8, p9
   415  		getfsig	acc1_3 = fp2a_3
   416  		st8	[rp] = s0, 8
   417  		xma.l	fp0b_2 = u_2, v0, f0
   418  	(p8)	add	acc0 = pr0_3, acc1_1, 1
   419  	(p9)	add	acc0 = pr0_3, acc1_1
   420  		xma.hu	fp1a_2 = u_2, v0, f0
   421  	;;					C 05
        C Entry point from L(gt4), the n mod 4 = 0 feed-in.
   422  L(00):
   423  	.pred.rel "mutex", p12, p13
   424  		getfsig	pr0_1 = fp0b_1
   425  		xma.l	fp1b_1 = u_1, v1, fp1a_1
   426  	(p12)	add	s0 = pr1_2, acc0, 1
   427  	(p13)	add	s0 = pr1_2, acc0
   428  		xma.hu	fp2a_1 = u_1, v1, fp1a_1
   429  		nop	1
   430  	;;					C 06
   431  	.pred.rel "mutex", p8, p9
   432  	.pred.rel "mutex", p12, p13
   433  		ldf8	u_1 = [up], 8
   434  		getfsig	pr1_0 = fp1b_0
   435  	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
   436  	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
   437  	(p12)	cmp.leu	p10, p11 = s0, pr1_2
   438  	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
   439  	;;					C 07
   440  	.pred.rel "mutex", p6, p7
   441  		getfsig	acc1_0 = fp2a_0
   442  		st8	[rp] = s0, 8
   443  		xma.l	fp0b_3 = u_3, v0, f0
   444  	(p6)	add	acc0 = pr0_0, acc1_2, 1
   445  	(p7)	add	acc0 = pr0_0, acc1_2
   446  		xma.hu	fp1a_3 = u_3, v0, f0
   447  	;;					C 08
        C Entry point from L(gt3), the n mod 4 = 3 feed-in.
   448  L(11):
   449  	.pred.rel "mutex", p10, p11
   450  		getfsig	pr0_2 = fp0b_2
   451  		xma.l	fp1b_2 = u_2, v1, fp1a_2
   452  	(p10)	add	s0 = pr1_3, acc0, 1
   453  	(p11)	add	s0 = pr1_3, acc0
   454  		xma.hu	fp2a_2 = u_2, v1, fp1a_2
   455  		nop	1
   456  	;;					C 09
   457  	.pred.rel "mutex", p6, p7
   458  	.pred.rel "mutex", p10, p11
   459  		ldf8	u_2 = [up], 8
   460  		getfsig	pr1_1 = fp1b_1
   461  	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
   462  	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
   463  	(p10)	cmp.leu	p12, p13 = s0, pr1_3
   464  	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
   465  	;;					C 10
   466  	.pred.rel "mutex", p8, p9
   467  		getfsig	acc1_1 = fp2a_1
   468  		st8	[rp] = s0, 8
   469  		xma.l	fp0b_0 = u_0, v0, f0
   470  	(p8)	add	acc0 = pr0_1, acc1_3, 1
   471  	(p9)	add	acc0 = pr0_1, acc1_3
   472  		xma.hu	fp1a_0 = u_0, v0, f0
   473  	;;					C 11
        C Entry point from L(gt2), the n mod 4 = 2 feed-in.
   474  L(10):
   475  	.pred.rel "mutex", p12, p13
   476  		getfsig	pr0_3 = fp0b_3
   477  		xma.l	fp1b_3 = u_3, v1, fp1a_3
   478  	(p12)	add	s0 = pr1_0, acc0, 1
   479  	(p13)	add	s0 = pr1_0, acc0
   480  		xma.hu	fp2a_3 = u_3, v1, fp1a_3
        C Loop back while ar.lc is nonzero; br.cloop decrements it when taken.
   481  		br.cloop.dptk	L(top)
   482  	;;
   483  C *** MAIN LOOP END ***
   484  
        C Wind-down. The loop phases are repeated here without any further u loads,
        C and with fewer xma instructions as the remaining products run out. The
        C carry handling with cmp.leu / cmp.ltu is the same as in the main loop.
        C L(cj5), L(cj4) and L(cj3) are entered directly from L(b01), L(b00) and
        C L(b11) when the br.cloop in that feed-in block falls through.
   485  	.pred.rel "mutex", p8, p9
   486  	.pred.rel "mutex", p12, p13
   487   {.mmi;		getfsig	pr1_2 = fp1b_2
   488  		st8	[rp] = s0, 8
   489  	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
   490  }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
   491  	(p12)	cmp.leu	p10, p11 = s0, pr1_0
   492  	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
   493  	;;
   494  }	.pred.rel "mutex", p6, p7
   495   {.mfi;		getfsig	acc1_2 = fp2a_2
   496  		xma.l	fp0b_1 = u_1, v0, f0
   497  		nop	1
   498  }{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
   499  	(p7)	add	acc0 = pr0_2, acc1_0
   500  		xma.hu	fp1a_1 = u_1, v0, f0
   501  	;;
   502  }
        C Entry from L(b01) when n mod 4 = 1 and ar.lc was zero.
   503  L(cj5):
   504  	.pred.rel "mutex", p10, p11
   505   {.mfi;		getfsig	pr0_0 = fp0b_0
   506  		xma.l	fp1b_0 = u_0, v1, fp1a_0
   507  	(p10)	add	s0 = pr1_1, acc0, 1
   508  }{.mfi;	(p11)	add	s0 = pr1_1, acc0
   509  		xma.hu	fp2a_0 = u_0, v1, fp1a_0
   510  		nop	1
   511  	;;
   512  }	.pred.rel "mutex", p6, p7
   513  	.pred.rel "mutex", p10, p11
   514   {.mmi;		getfsig	pr1_3 = fp1b_3
   515  		st8	[rp] = s0, 8
   516  	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
   517  }{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
   518  	(p10)	cmp.leu	p12, p13 = s0, pr1_1
   519  	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
   520  	;;
   521  }	.pred.rel "mutex", p8, p9
   522   {.mfi;		getfsig	acc1_3 = fp2a_3
   523  		xma.l	fp0b_2 = u_2, v0, f0
   524  		nop	1
   525  }{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
   526  	(p9)	add	acc0 = pr0_3, acc1_1
   527  		xma.hu	fp1a_2 = u_2, v0, f0
   528  	;;
   529  }
        C Entry from L(b00) when n mod 4 = 0 and ar.lc was zero.
   530  L(cj4):
   531  	.pred.rel "mutex", p12, p13
   532   {.mfi;		getfsig	pr0_1 = fp0b_1
   533  		xma.l	fp1b_1 = u_1, v1, fp1a_1
   534  	(p12)	add	s0 = pr1_2, acc0, 1
   535  }{.mfi;	(p13)	add	s0 = pr1_2, acc0
   536  		xma.hu	fp2a_1 = u_1, v1, fp1a_1
   537  		nop	1
   538  	;;
   539  }	.pred.rel "mutex", p8, p9
   540  	.pred.rel "mutex", p12, p13
   541   {.mmi;		getfsig	pr1_0 = fp1b_0
   542  		st8	[rp] = s0, 8
   543  	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
   544  }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
   545  	(p12)	cmp.leu	p10, p11 = s0, pr1_2
   546  	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
   547  	;;
   548  }	.pred.rel "mutex", p6, p7
   549   {.mmi;		getfsig	acc1_0 = fp2a_0
   550  	(p6)	add	acc0 = pr0_0, acc1_2, 1
   551  	(p7)	add	acc0 = pr0_0, acc1_2
   552  	;;
   553  }
        C Entry from L(b11) when n mod 4 = 3 and ar.lc was zero. The xma pair below
        C is the last one issued; its high half (fp2a_2) later becomes r8.
   554  L(cj3):
   555  	.pred.rel "mutex", p10, p11
   556   {.mfi;		getfsig	pr0_2 = fp0b_2
   557  		xma.l	fp1b_2 = u_2, v1, fp1a_2
   558  	(p10)	add	s0 = pr1_3, acc0, 1
   559  }{.mfi;	(p11)	add	s0 = pr1_3, acc0
   560  		xma.hu	fp2a_2 = u_2, v1, fp1a_2
   561  		nop	1
   562  	;;
   563  }	.pred.rel "mutex", p6, p7
   564  	.pred.rel "mutex", p10, p11
   565   {.mmi;		getfsig	pr1_1 = fp1b_1
   566  		st8	[rp] = s0, 8
   567  	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
   568  }{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
   569  	(p10)	cmp.leu	p12, p13 = s0, pr1_3
   570  	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
   571  	;;
   572  }	.pred.rel "mutex", p8, p9
   573   {.mmi;		getfsig	acc1_1 = fp2a_1
   574  	(p8)	add	acc0 = pr0_1, acc1_3, 1
   575  	(p9)	add	acc0 = pr0_1, acc1_3
   576  	;;
   577  }	.pred.rel "mutex", p12, p13
   578   {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
   579  	(p13)	add	s0 = pr1_0, acc0
   580  		nop	1
   581  	;;
   582  }	.pred.rel "mutex", p8, p9
   583  	.pred.rel "mutex", p12, p13
   584   {.mmi;		getfsig	pr1_2 = fp1b_2
   585  		st8	[rp] = s0, 8
   586  	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
   587  }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
   588  	(p12)	cmp.leu	p10, p11 = s0, pr1_0
   589  	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
   590  	;;
   591  }	.pred.rel "mutex", p6, p7
        C r8 starts as the high half from the last xma.hu; a final carry may be
        C added to it just before the return.
   592   {.mmi;		getfsig	r8 = fp2a_2
   593  	(p6)	add	acc0 = pr0_2, acc1_0, 1
   594  	(p7)	add	acc0 = pr0_2, acc1_0
   595  	;;
   596  }	.pred.rel "mutex", p10, p11
        C The p6 and p7 halves of the acc0 carry test sit in two consecutive
        C instruction groups here; only one of them executes.
   597   {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
   598  	(p11)	add	s0 = pr1_1, acc0
   599  	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
   600  	;;
   601  }	.pred.rel "mutex", p10, p11
   602   {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
   603  	(p10)	cmp.leu	p12, p13 = s0, pr1_1
   604  	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
   605  	;;
   606  }	.pred.rel "mutex", p8, p9
        C Last stored limb: acc0 = pr1_2 + acc1_1 plus the p8 carry, then plus one
        C more when p12 (the carry out of the last s0 addition) is set.
   607   {.mmi;		st8	[rp] = s0, 8
   608  	(p8)	add	acc0 = pr1_2, acc1_1, 1
   609  	(p9)	add	acc0 = pr1_2, acc1_1
   610  	;;
   611  }	.pred.rel "mutex", p8, p9
        C p10 = carry out of the first addition. The compares read acc0 before the
        C p12 increment in the same instruction group takes effect.
        C NOTE(review): cmpeqor is not defined in this file; presumably a macro
        C supplied through config.m4 that ORs an equality test into p10, here
        C catching a wrap of acc0 to zero after the p12 increment -- confirm.
   612   {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
   613  	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
   614  	(p12)	add	acc0 = 1, acc0
   615  	;;
   616  }{.mmi;		st8	[rp] = acc0, 8
   617  	(p12)	cmpeqor	p10, p0 = 0, acc0
   618  		nop	1
   619  	;;
   620  }{.mib;	(p10)	add	r8 = 1, r8
   621  		mov	ar.lc = r2
   622  		br.ret.sptk.many b0
   623  }
   624  EPILOGUE()
   625  ASM_END()