github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/bdiv_dbm1c.asm (about)

     1  dnl  IA-64 mpn_bdiv_dbm1.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2008, 2009 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C         cycles/limb
    36  C Itanium:    4
    37  C Itanium 2:  2
    38  
    39  C TODO
    40  C  * Optimize feed-in and wind-down code, both for speed and code size.
    41  
    42  C INPUT PARAMETERS
        C  rp  (r32)  destination pointer; one result limb is stored per source limb
        C  up  (r33)  source pointer; limbs are loaded from here with post-increment
        C  n   (r34)  limb count; n mod 4 picks the feed-in path, (n-1)/4 the loop count
        C  bd  (r35)  multiplier, moved to f6 and applied to every source limb
        C  r36, the next input register after bd, has no alias; it is read directly
        C  as the initial running value.
    43  define(`rp', `r32')
    44  define(`up', `r33')
    45  define(`n', `r34')
    46  define(`bd', `r35')
    47  
    48  ASM_START()
    49  PROLOGUE(mpn_bdiv_dbm1c)
        C  With a running value h (seeded from r36), for i = 0 .. n-1 this does
        C      p  = up[i] * bd      (low half via xma.l, high half via xma.hu)
        C      cy = h < lo(p)       (unsigned compare)
        C      h  = h - lo(p);  rp[i] = h
        C      h  = h - hi(p) - cy
        C  and returns the final h in r8.
        C
        C  Register use:
        C    r15      running value h
        C    f6       bd
        C    f9-f13   source limbs loaded from up
        C    f32-f39  products; even registers hold low halves, odd ones high halves
        C    r20-r27  the same product halves moved to integer registers
        C    r16-r19  result limbs waiting to be stored
        C    p6/p7    borrow / no borrow out of h - lo(p); selects which sub runs
        C    r2       saved ar.lc, restored before returning
        C
        C  NOTE(review): n = 0 is not special-cased (ar.lc would be set from
        C  (0-1) >> 2); presumably callers guarantee n >= 1 -- confirm.
    50  	.prologue
    51  	.save		ar.lc, r2
    52  	.body
    53  
        C  Under HAVE_ABI_32, addp4 widens the two 32-bit pointers and zxt4
        C  zero-extends n.
    54  ifdef(`HAVE_ABI_32',
    55  `	addp4		rp = 0, rp		C M I
    56  	addp4		up = 0, up		C M I
    57  	zxt4		n = n			C I
    58  	;;
    59  ')
        C  Seed h from r36 and start loading the first source limb into f9.
    60  {.mmb
    61  	mov		r15 = r36		C M I
    62  	ldf8		f9 = [up], 8		C M
    63  	nop.b		0			C B
    64  }
    65  .Lcommon:
        C  r16 = n - 1, r14 = n mod 4; the old ar.lc is kept in r2.
    66  {.mii
    67  	adds		r16 = -1, n		C M I
    68  	mov		r2 = ar.lc		C I0
    69  	and		r14 = 3, n		C M I
    70  	;;
    71  }
        C  r31 = (n - 1) >> 2 becomes the loop counter.  p10/p11/p12 flag
        C  n mod 4 = 0/2/3; the remaining case n mod 4 = 1 falls through to .Lb01.
    72  {.mii
    73  	setf.sig	f6 = bd			C M2 M3
    74  	shr.u		r31 = r16, 2		C I0
    75  	cmp.eq		p10, p0 = 0, r14	C M I
    76  }
    77  {.mii
    78  	nop.m		0			C M
    79  	cmp.eq		p11, p0 = 2, r14	C M I
    80  	cmp.eq		p12, p0 = 3, r14	C M I
    81  	;;
    82  }
        C  r0 != r0 is never true, so p6 and p8 start false, p7 and p9 true.
        C  NOTE(review): p8/p9 are not referenced anywhere else in this function.
    83  {.mii
    84  	cmp.ne		p6, p7 = r0, r0		C M I
    85  	mov.i		ar.lc = r31		C I0
    86  	cmp.ne		p8, p9 = r0, r0		C M I
    87  }
    88  {.bbb
    89    (p10)	br.dptk		.Lb00			C B
    90    (p11)	br.dptk		.Lb10			C B
    91    (p12)	br.dptk		.Lb11			C B
    92  	;;
    93  }
    94  
        C  Feed-in for n mod 4 = 1.  Each br.cloop is taken (and decrements ar.lc)
        C  only while ar.lc is nonzero, so the fall-through here is n = 1: one
        C  product, then straight to the last wind-down step.
    95  .Lb01:	br.cloop.dptk	.grt1
    96  	;;
    97  	xma.l		f38 = f9, f6, f0
    98  	xma.hu		f39 = f9, f6, f0
    99  	;;
   100  	getf.sig	r26 = f38
   101  	getf.sig	r27 = f39
   102  	br		.Lcj1
   103  
        C  n >= 5: load four more limbs and start the products.  If the second
        C  br.cloop falls through, n = 5 and the rest is finished via .Lcj5.
   104  .grt1:	ldf8		f10 = [r33], 8
   105  	;;
   106  	ldf8		f11 = [r33], 8
   107  	;;
   108  	ldf8		f12 = [r33], 8
   109  	;;
   110  	xma.l		f38 = f9, f6, f0
   111  	xma.hu		f39 = f9, f6, f0
   112  	;;
   113  	ldf8		f13 = [r33], 8
   114  	;;
   115  	xma.l		f32 = f10, f6, f0
   116  	xma.hu		f33 = f10, f6, f0
   117  	br.cloop.dptk	.grt5
   118  
   119  	;;
   120  	getf.sig	r26 = f38
   121  	xma.l		f34 = f11, f6, f0
   122  	xma.hu		f35 = f11, f6, f0
   123  	;;
   124  	getf.sig	r27 = f39
   125  	;;
   126  	getf.sig	r20 = f32
   127  	xma.l		f36 = f12, f6, f0
   128  	xma.hu		f37 = f12, f6, f0
   129  	;;
   130  	getf.sig	r21 = f33
   131  	;;
   132  	getf.sig	r22 = f34
   133  	xma.l		f38 = f13, f6, f0
   134  	xma.hu		f39 = f13, f6, f0
   135  	br		.Lcj5
   136  
        C  n >= 9: same work interleaved with loads for the next group, then
        C  join the main loop at its .LL01 entry point.
   137  .grt5:	ldf8		f10 = [r33], 8
   138  	;;
   139  	getf.sig	r26 = f38
   140  	xma.l		f34 = f11, f6, f0
   141  	xma.hu		f35 = f11, f6, f0
   142  	;;
   143  	getf.sig	r27 = f39
   144  	ldf8		f11 = [r33], 8
   145  	;;
   146  	getf.sig	r20 = f32
   147  	xma.l		f36 = f12, f6, f0
   148  	xma.hu		f37 = f12, f6, f0
   149  	;;
   150  	getf.sig	r21 = f33
   151  	ldf8		f12 = [r33], 8
   152  	;;
   153  	getf.sig	r22 = f34
   154  	xma.l		f38 = f13, f6, f0
   155  	xma.hu		f39 = f13, f6, f0
   156  	br		.LL01
   157  
        C  Feed-in for n mod 4 = 2.  Fall-through is n = 2: two products, then
        C  the last two wind-down steps.
   158  .Lb10:	ldf8		f13 = [r33], 8
   159  	br.cloop.dptk	.grt2
   160  	;;
   161  
   162  	xma.l		f36 = f9, f6, f0
   163  	xma.hu		f37 = f9, f6, f0
   164  	;;
   165  	xma.l		f38 = f13, f6, f0
   166  	xma.hu		f39 = f13, f6, f0
   167  	;;
   168  	getf.sig	r24 = f36
   169  	;;
   170  	getf.sig	r25 = f37
   171  	;;
   172  	getf.sig	r26 = f38
   173  	;;
   174  	getf.sig	r27 = f39
   175  	br		.Lcj2
   176  
        C  n >= 6.  If the second br.cloop falls through, n = 6 and the rest is
        C  finished via .Lcj6.
   177  .grt2:	ldf8		f10 = [r33], 8
   178  	;;
   179  	ldf8		f11 = [r33], 8
   180  	;;
   181  	xma.l		f36 = f9, f6, f0
   182  	xma.hu		f37 = f9, f6, f0
   183  	;;
   184  	ldf8		f12 = [r33], 8
   185  	;;
   186  	xma.l		f38 = f13, f6, f0
   187  	xma.hu		f39 = f13, f6, f0
   188  	;;
   189  	ldf8		f13 = [r33], 8
   190  	;;
   191  	getf.sig	r24 = f36
   192  	xma.l		f32 = f10, f6, f0
   193  	xma.hu		f33 = f10, f6, f0
   194  	br.cloop.dptk	.grt6
   195  
   196  	getf.sig	r25 = f37
   197  	;;
   198  	getf.sig	r26 = f38
   199  	xma.l		f34 = f11, f6, f0
   200  	xma.hu		f35 = f11, f6, f0
   201  	;;
   202  	getf.sig	r27 = f39
   203  	;;
   204  	getf.sig	r20 = f32
   205  	xma.l		f36 = f12, f6, f0
   206  	xma.hu		f37 = f12, f6, f0
   207  	br		.Lcj6
   208  
        C  n >= 10: join the main loop at its .LL10 entry point.
   209  .grt6:	getf.sig	r25 = f37
   210  	ldf8		f10 = [r33], 8
   211  	;;
   212  	getf.sig	r26 = f38
   213  	xma.l		f34 = f11, f6, f0
   214  	xma.hu		f35 = f11, f6, f0
   215  	;;
   216  	getf.sig	r27 = f39
   217  	ldf8		f11 = [r33], 8
   218  	;;
   219  	getf.sig	r20 = f32
   220  	xma.l		f36 = f12, f6, f0
   221  	xma.hu		f37 = f12, f6, f0
   222  	br		.LL10
   223  
   224  
        C  Feed-in for n mod 4 = 3.  Fall-through is n = 3: three products, then
        C  the last three wind-down steps.
   225  .Lb11:	ldf8		f12 = [r33], 8
   226  	;;
   227  	ldf8		f13 = [r33], 8
   228  	br.cloop.dptk	.grt3
   229  	;;
   230  
   231  	xma.l		f34 = f9, f6, f0
   232  	xma.hu		f35 = f9, f6, f0
   233  	;;
   234  	xma.l		f36 = f12, f6, f0
   235  	xma.hu		f37 = f12, f6, f0
   236  	;;
   237  	getf.sig	r22 = f34
   238  	xma.l		f38 = f13, f6, f0
   239  	xma.hu		f39 = f13, f6, f0
   240  	;;
   241  	getf.sig	r23 = f35
   242  	;;
   243  	getf.sig	r24 = f36
   244  	;;
   245  	getf.sig	r25 = f37
   246  	;;
   247  	getf.sig	r26 = f38
   248  	br		.Lcj3
   249  
        C  n >= 7.  If the second br.cloop falls through, n = 7 and the rest is
        C  finished via .Lcj7.
   250  .grt3:	ldf8		f10 = [r33], 8
   251  	;;
   252  	xma.l		f34 = f9, f6, f0
   253  	xma.hu		f35 = f9, f6, f0
   254  	;;
   255  	ldf8		f11 = [r33], 8
   256  	;;
   257  	xma.l		f36 = f12, f6, f0
   258  	xma.hu		f37 = f12, f6, f0
   259  	;;
   260  	ldf8		f12 = [r33], 8
   261  	;;
   262  	getf.sig	r22 = f34
   263  	xma.l		f38 = f13, f6, f0
   264  	xma.hu		f39 = f13, f6, f0
   265  	;;
   266  	getf.sig	r23 = f35
   267  	ldf8		f13 = [r33], 8
   268  	;;
   269  	getf.sig	r24 = f36
   270  	xma.l		f32 = f10, f6, f0
   271  	xma.hu		f33 = f10, f6, f0
   272  	br.cloop.dptk	.grt7
   273  
   274  	getf.sig	r25 = f37
   275  	;;
   276  	getf.sig	r26 = f38
   277  	xma.l		f34 = f11, f6, f0
   278  	xma.hu		f35 = f11, f6, f0
   279  	br		.Lcj7
   280  
        C  n >= 11: join the main loop at its .LL11 entry point.
   281  .grt7:	getf.sig	r25 = f37
   282  	ldf8		f10 = [r33], 8
   283  	;;
   284  	getf.sig	r26 = f38
   285  	xma.l		f34 = f11, f6, f0
   286  	xma.hu		f35 = f11, f6, f0
   287  	br		.LL11
   288  
   289  
        C  Feed-in for n mod 4 = 0.  Fall-through is n = 4: four products, then
        C  the last four wind-down steps.
   290  .Lb00:	ldf8		f11 = [r33], 8
   291  	;;
   292  	ldf8		f12 = [r33], 8
   293  	;;
   294  	ldf8		f13 = [r33], 8
   295  	br.cloop.dptk	.grt4
   296  	;;
   297  
   298  	xma.l		f32 = f9, f6, f0
   299  	xma.hu		f33 = f9, f6, f0
   300  	;;
   301  	xma.l		f34 = f11, f6, f0
   302  	xma.hu		f35 = f11, f6, f0
   303  	;;
   304  	getf.sig	r20 = f32
   305  	xma.l		f36 = f12, f6, f0
   306  	xma.hu		f37 = f12, f6, f0
   307  	;;
   308  	getf.sig	r21 = f33
   309  	;;
   310  	getf.sig	r22 = f34
   311  	xma.l		f38 = f13, f6, f0
   312  	xma.hu		f39 = f13, f6, f0
   313  	;;
   314  	getf.sig	r23 = f35
   315  	;;
   316  	getf.sig	r24 = f36
   317  	br		.Lcj4
   318  
        C  n >= 8.  The final br.cloop enters the main loop at .LL00 when
        C  n >= 12; otherwise n = 8 and the rest is finished via .Lcj8.
   319  .grt4:	xma.l		f32 = f9, f6, f0
   320  	xma.hu		f33 = f9, f6, f0
   321  	;;
   322  	ldf8		f10 = [r33], 8
   323  	;;
   324  	xma.l		f34 = f11, f6, f0
   325  	xma.hu		f35 = f11, f6, f0
   326  	;;
   327  	ldf8		f11 = [r33], 8
   328  	;;
   329  	getf.sig	r20 = f32
   330  	xma.l		f36 = f12, f6, f0
   331  	xma.hu		f37 = f12, f6, f0
   332  	;;
   333  	getf.sig	r21 = f33
   334  	ldf8		f12 = [r33], 8
   335  	;;
   336  	getf.sig	r22 = f34
   337  	xma.l		f38 = f13, f6, f0
   338  	xma.hu		f39 = f13, f6, f0
   339  	;;
   340  	getf.sig	r23 = f35
   341  	ldf8		f13 = [r33], 8
   342  	;;
   343  	getf.sig	r24 = f36
   344  	xma.l		f32 = f10, f6, f0
   345  	xma.hu		f33 = f10, f6, f0
   346  	br.cloop.dptk	.LL00
   347  	br		.Lcj8
   348  
        C  Main loop: four limbs per iteration.  The four stages are the same
        C  step with the registers rotated: each compares h with a low product
        C  half (setting p6/p7), subtracts it, stores the result limb, then
        C  subtracts the high half, with an extra 1 when p6 is set.  Loads,
        C  multiplies and getf.sig transfers for later limbs are interleaved.
        C  .LL00/.LL11/.LL10/.LL01 are the mid-iteration entry points used by
        C  the feed-in code above.
   349  C *** MAIN LOOP START ***
   350  	ALIGN(32)
   351  .Ltop:
   352  	.pred.rel "mutex",p6,p7
   353  C	.mfi
   354  	getf.sig	r24 = f36
   355  	xma.l		f32 = f10, f6, f0
   356    (p6)	sub		r15 = r19, r27, 1
   357  C	.mfi
   358  	st8		[r32] = r19, 8
   359  	xma.hu		f33 = f10, f6, f0
   360    (p7)	sub		r15 = r19, r27
   361  	;;
   362  .LL00:
   363  C	.mfi
   364  	getf.sig	r25 = f37
   365  	nop.f 0
   366  	cmp.ltu		p6, p7 = r15, r20
   367  C	.mib
   368  	ldf8		f10 = [r33], 8
   369  	sub		r16 = r15, r20
   370  	nop.b 0
   371  	;;
   372  
   373  C	.mfi
   374  	getf.sig	r26 = f38
   375  	xma.l		f34 = f11, f6, f0
   376    (p6)	sub		r15 = r16, r21, 1
   377  C	.mfi
   378  	st8		[r32] = r16, 8
   379  	xma.hu		f35 = f11, f6, f0
   380    (p7)	sub		r15 = r16, r21
   381  	;;
   382  .LL11:
   383  C	.mfi
   384  	getf.sig	r27 = f39
   385  	nop.f 0
   386  	cmp.ltu		p6, p7 = r15, r22
   387  C	.mib
   388  	ldf8		f11 = [r33], 8
   389  	sub		r17 = r15, r22
   390  	nop.b 0
   391  	;;
   392  
   393  C	.mfi
   394  	getf.sig	r20 = f32
   395  	xma.l		f36 = f12, f6, f0
   396    (p6)	sub		r15 = r17, r23, 1
   397  C	.mfi
   398  	st8		[r32] = r17, 8
   399  	xma.hu		f37 = f12, f6, f0
   400    (p7)	sub		r15 = r17, r23
   401  	;;
   402  .LL10:
   403  C	.mfi
   404  	getf.sig	r21 = f33
   405  	nop.f 0
   406  	cmp.ltu		p6, p7 = r15, r24
   407  C	.mib
   408  	ldf8		f12 = [r33], 8
   409  	sub		r18 = r15, r24
   410  	nop.b 0
   411  	;;
   412  
   413  C	.mfi
   414  	getf.sig	r22 = f34
   415  	xma.l		f38 = f13, f6, f0
   416    (p6)	sub		r15 = r18, r25, 1
   417  C	.mfi
   418  	st8		[r32] = r18, 8
   419  	xma.hu		f39 = f13, f6, f0
   420    (p7)	sub		r15 = r18, r25
   421  	;;
   422  .LL01:
   423  C	.mfi
   424  	getf.sig	r23 = f35
   425  	nop.f 0
   426  	cmp.ltu		p6, p7 = r15, r26
   427  C	.mib
   428  	ldf8		f13 = [r33], 8
   429  	sub		r19 = r15, r26
   430  	br.cloop.sptk.few .Ltop
   431  C *** MAIN LOOP END ***
   432  	;;
   433  
        C  Wind-down.  No further loads are issued from here on.  Label .LcjK is
        C  entered with K result limbs still to be stored; the multiplies for
        C  the limbs already held in f10-f13 are finished in the first steps.
   434  	getf.sig	r24 = f36
   435  	xma.l		f32 = f10, f6, f0
   436    (p6)	sub		r15 = r19, r27, 1
   437  	st8		[r32] = r19, 8
   438  	xma.hu		f33 = f10, f6, f0
   439    (p7)	sub		r15 = r19, r27
   440  	;;
   441  .Lcj8:	getf.sig	r25 = f37
   442  	cmp.ltu		p6, p7 = r15, r20
   443  	sub		r16 = r15, r20
   444  	;;
   445  	getf.sig	r26 = f38
   446  	xma.l		f34 = f11, f6, f0
   447    (p6)	sub		r15 = r16, r21, 1
   448  	st8		[r32] = r16, 8
   449  	xma.hu		f35 = f11, f6, f0
   450    (p7)	sub		r15 = r16, r21
   451  	;;
   452  .Lcj7:	getf.sig	r27 = f39
   453  	cmp.ltu		p6, p7 = r15, r22
   454  	sub		r17 = r15, r22
   455  	;;
   456  	getf.sig	r20 = f32
   457  	xma.l		f36 = f12, f6, f0
   458    (p6)	sub		r15 = r17, r23, 1
   459  	st8		[r32] = r17, 8
   460  	xma.hu		f37 = f12, f6, f0
   461    (p7)	sub		r15 = r17, r23
   462  	;;
   463  .Lcj6:	getf.sig	r21 = f33
   464  	cmp.ltu		p6, p7 = r15, r24
   465  	sub		r18 = r15, r24
   466  	;;
   467  	getf.sig	r22 = f34
   468  	xma.l		f38 = f13, f6, f0
   469    (p6)	sub		r15 = r18, r25, 1
   470  	st8		[r32] = r18, 8
   471  	xma.hu		f39 = f13, f6, f0
   472    (p7)	sub		r15 = r18, r25
   473  	;;
   474  .Lcj5:	getf.sig	r23 = f35
   475  	cmp.ltu		p6, p7 = r15, r26
   476  	sub		r19 = r15, r26
   477  	;;
   478  	getf.sig	r24 = f36
   479    (p6)	sub		r15 = r19, r27, 1
   480  	st8		[r32] = r19, 8
   481    (p7)	sub		r15 = r19, r27
   482  	;;
   483  .Lcj4:	getf.sig	r25 = f37
   484  	cmp.ltu		p6, p7 = r15, r20
   485  	sub		r16 = r15, r20
   486  	;;
   487  	getf.sig	r26 = f38
   488    (p6)	sub		r15 = r16, r21, 1
   489  	st8		[r32] = r16, 8
   490    (p7)	sub		r15 = r16, r21
   491  	;;
   492  .Lcj3:	getf.sig	r27 = f39
   493  	cmp.ltu		p6, p7 = r15, r22
   494  	sub		r17 = r15, r22
   495  	;;
   496    (p6)	sub		r15 = r17, r23, 1
   497  	st8		[r32] = r17, 8
   498    (p7)	sub		r15 = r17, r23
   499  	;;
   500  .Lcj2:	cmp.ltu		p6, p7 = r15, r24
   501  	sub		r18 = r15, r24
   502  	;;
   503    (p6)	sub		r15 = r18, r25, 1
   504  	st8		[r32] = r18, 8
   505    (p7)	sub		r15 = r18, r25
   506  	;;
        C  Last limb: stored without post-increment, and the final running
        C  value goes to r8 instead of r15.  ar.lc is restored from r2.
   507  .Lcj1:	cmp.ltu		p6, p7 = r15, r26
   508  	sub		r19 = r15, r26
   509  	;;
   510    (p6)	sub		r8 = r19, r27, 1
   511  	st8		[r32] = r19
   512    (p7)	sub		r8 = r19, r27
   513  	mov ar.lc = r2
   514  	br.ret.sptk.many b0
   515  EPILOGUE()
   516  ASM_END()