github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/mul_1.asm (about)

     1  dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
     2  dnl  store the result in a second limb vector.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund.
     5  
     6  dnl  Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  C         cycles/limb
    37  C Itanium:    4.0
    38  C Itanium 2:  2.0
    39  
    40  C TODO
    41  C  * Further optimize feed-in and wind-down code, both for speed and code size.
    42  C  * Handle low limb input and results specially, using a common stf8 in the
    43  C    epilogue.
    44  C  * Use 1 c/l carry propagation scheme in wind-down code.
    45  C  * Use extra pointer register for `up' to speed up feed-in loads.
    46  C  * Work out final differences with addmul_1.asm.
    47  
    48  C INPUT PARAMETERS
    49  define(`rp', `r32')
    50  define(`up', `r33')
    51  define(`n', `r34')
    52  define(`vl', `r35')
    53  define(`cy', `r36')	C for mpn_mul_1c
    54  
    55  ASM_START()
    56  PROLOGUE(mpn_mul_1)
        C mpn_mul_1 (rp, up, n, vl): multiply the n-limb vector at up by the
        C single limb vl and store the product limbs at rp.  The high limb of
        C the final product is left in r8 on return.  The carry-in addend f9
        C is zero here; mpn_mul_1c enters at .Lcommon with f9 = cy instead.
        C NOTE(review): n = 0 is not special-cased (n-1 is shifted unsigned to
        C form the loop count), so callers presumably guarantee n >= 1.
    57  	.prologue
    58  	.save	ar.lc, r2
    59  	.body
    60  
        C 32-bit ABI only: widen both pointers with addp4 and zero-extend n.
    61  ifdef(`HAVE_ABI_32',
    62  `	addp4		rp = 0, rp		C M I
    63  	addp4		up = 0, up		C M I
    64  	zxt4		n = n			C I
    65  	;;
    66  ')
        C r15 = n-1, f9 = f0 (zero carry-in addend), r2 = caller's ar.lc.
    67  {.mfi
    68  	adds		r15 = -1, n		C M I
    69  	mov		f9 = f0			C F
    70  	mov.i		r2 = ar.lc		C I0
    71  }
        C f7 = up[0] (up is post-incremented by one limb), r14 = n mod 4.
    72  {.mmi
    73  	ldf8		f7 = [up], 8		C M
    74  	nop.m		0			C M
    75  	and		r14 = 3, n		C M I
    76  	;;
    77  }
    78  .Lcommon:
        C Common path for mpn_mul_1 and mpn_mul_1c.  On entry here r15 = n-1,
        C r14 = n mod 4, f7 = up[0], f9 = carry-in addend, r2 = saved ar.lc.
        C f6 = vl, r31 = (n-1)/4 (loop count), p10 set when n mod 4 == 0.
    79  {.mii
    80  	setf.sig	f6 = vl			C M2 M3
    81  	shr.u		r31 = r15, 2		C I0
    82  	cmp.eq		p10, p0 = 0, r14	C M I
    83  }
        C p11 set when n mod 4 == 2, p12 set when n mod 4 == 3.
    84  {.mii
    85  	cmp.eq		p11, p0 = 2, r14	C M I
    86  	cmp.eq		p12, p0 = 3, r14	C M I
    87  	nop.i		0			C I
    88  	;;
    89  }
        C cmp.ne r0, r0 is always false: p6 and p8 (carry set) are cleared,
        C p7 and p9 (carry clear) are set, so the carry chain starts at zero.
    90  {.mii
    91  	cmp.ne		p6, p7 = r0, r0		C M I
    92  	mov.i		ar.lc = r31		C I0
    93  	cmp.ne		p8, p9 = r0, r0		C M I
    94  }
        C Dispatch on n mod 4; the remaining case n mod 4 == 1 falls through
        C to .Lb01.
    95  {.bbb
    96    (p10)	br.dptk		.Lb00			C B
    97    (p11)	br.dptk		.Lb10			C B
    98    (p12)	br.dptk		.Lb11			C B
    99  	;;
   100  }
   101  
   102  .Lb01:	mov		r20 = 0
        C Feed-in for n mod 4 == 1.  r20 = 0 stands in for the high limb that
        C is added to the first low limb when the loop is entered at .LL01.
        C br.cloop falls through only when ar.lc = (n-1)/4 is zero, i.e. n = 1.
   103  	br.cloop.dptk	.grt1			C B
   104  
        C n = 1: a single product up[0]*vl + f9.  The low half goes to rp[0],
        C the high half is returned in r8.
   105  	xma.l		f39 = f7, f6, f9	C F
   106  	xma.hu		f43 = f7, f6, f9	C F
   107  	;;
   108  	getf.sig	r8 = f43		C M2
   109  	stf8		[rp] = f39		C M2 M3
   110  	mov.i		ar.lc = r2		C I0
   111  	br.ret.sptk.many b0			C B
   112  
   113  .grt1:
        C n >= 5: load the next four limbs and form the first product, which
        C is the only one that takes the carry-in addend f9.
   114  	ldf8		f32 = [up], 8
   115  	;;
   116  	ldf8		f33 = [up], 8
   117  	;;
   118  	ldf8		f34 = [up], 8
   119  	xma.l		f39 = f7, f6, f9
   120  	xma.hu		f43 = f7, f6, f9
   121  	;;
   122  	ldf8		f35 = [up], 8
   123  	br.cloop.dptk	.grt5
   124  
        C n = 5: form the remaining four products, store the first low limb
        C directly from f39 and join the wind-down code at .Lcj5.
   125  	xma.l		f36 = f32, f6, f0
   126  	xma.hu		f40 = f32, f6, f0
   127  	;;
   128  	stf8		[rp] = f39, 8
   129  	xma.l		f37 = f33, f6, f0
   130  	xma.hu		f41 = f33, f6, f0
   131  	;;
   132  	getf.sig	r21 = f43
   133  	getf.sig	r18 = f36
   134  	xma.l		f38 = f34, f6, f0
   135  	xma.hu		f42 = f34, f6, f0
   136  	;;
   137  	getf.sig	r22 = f40
   138  	getf.sig	r19 = f37
   139  	xma.l		f39 = f35, f6, f0
   140  	xma.hu		f43 = f35, f6, f0
   141  	;;
   142  	getf.sig	r23 = f41
   143  	getf.sig	r16 = f38
   144  	br		.Lcj5
   145  
   146  .grt5:
        C n >= 9: prime the pipeline (products, next loads, first transfers to
        C integer registers) and enter the main loop at .LL01.
   147  	xma.l		f36 = f32, f6, f0
   148  	xma.hu		f40 = f32, f6, f0
   149  	;;
   150  	getf.sig	r17 = f39
   151  	ldf8		f32 = [up], 8
   152  	xma.l		f37 = f33, f6, f0
   153  	xma.hu		f41 = f33, f6, f0
   154  	;;
   155  	getf.sig	r21 = f43
   156  	ldf8		f33 = [up], 8
   157  	xma.l		f38 = f34, f6, f0
   158  	;;
   159  	getf.sig	r18 = f36
   160  	xma.hu		f42 = f34, f6, f0
   161  	;;
   162  	getf.sig	r22 = f40
   163  	ldf8		f34 = [up], 8
   164  	xma.l		f39 = f35, f6, f0
   165  	;;
   166  	getf.sig	r19 = f37
   167  	xma.hu		f43 = f35, f6, f0
   168  	br		.LL01
   169  
   170  
   171  .Lb10:	ldf8		f35 = [up], 8
        C Feed-in for n mod 4 == 2.  f35 = up[1].  r23 = 0 stands in for the
        C high limb that is added to the first low limb when the loop is
        C entered at .LL10.  br.cloop falls through only when (n-1)/4 is zero,
        C i.e. n = 2.
   172  	mov		r23 = 0
   173  	br.cloop.dptk	.grt2
   174  
        C n = 2: rp[0] = low half of up[0]*vl + f9.  Its high half f42 is used
        C directly as the addend of the second xma, so no integer carry chain
        C is needed.  rp[1] = low half of that, r8 = its high half.
   175  	xma.l		f38 = f7, f6, f9
   176  	xma.hu		f42 = f7, f6, f9
   177  	;;
   178  	stf8		[rp] = f38, 8
   179  	xma.l		f39 = f35, f6, f42
   180  	xma.hu		f43 = f35, f6, f42
   181  	;;
   182  	getf.sig	r8 = f43
   183  	stf8		[rp] = f39
   184  	mov.i		ar.lc = r2
   185  	br.ret.sptk.many b0
   186  
   187  
   188  .grt2:
        C n >= 6: load four more limbs and form the first two products; only
        C the first one takes the carry-in addend f9.
   189  	ldf8		f32 = [up], 8
   190  	;;
   191  	ldf8		f33 = [up], 8
   192  	xma.l		f38 = f7, f6, f9
   193  	xma.hu		f42 = f7, f6, f9
   194  	;;
   195  	ldf8		f34 = [up], 8
   196  	xma.l		f39 = f35, f6, f0
   197  	xma.hu		f43 = f35, f6, f0
   198  	;;
   199  	ldf8		f35 = [up], 8
   200  	br.cloop.dptk	.grt6
   201  
        C n = 6: store the first low limb directly from f38, form the
        C remaining products and join the wind-down code at .Lcj6.
   202  	stf8		[rp] = f38, 8
   203  	xma.l		f36 = f32, f6, f0
   204  	xma.hu		f40 = f32, f6, f0
   205  	;;
   206  	getf.sig	r20 = f42
   207  	getf.sig	r17 = f39
   208  	xma.l		f37 = f33, f6, f0
   209  	xma.hu		f41 = f33, f6, f0
   210  	;;
   211  	getf.sig	r21 = f43
   212  	getf.sig	r18 = f36
   213  	xma.l		f38 = f34, f6, f0
   214  	xma.hu		f42 = f34, f6, f0
   215  	;;
   216  	getf.sig	r22 = f40
   217  	getf.sig	r19 = f37
   218  	xma.l		f39 = f35, f6, f0
   219  	xma.hu		f43 = f35, f6, f0
   220  	br		.Lcj6
   221  
   222  .grt6:
        C n >= 10: prime the pipeline and enter the main loop at .LL10.
   223  	getf.sig	r16 = f38
   224  	xma.l		f36 = f32, f6, f0
   225  	xma.hu		f40 = f32, f6, f0
   226  	;;
   227  	getf.sig	r20 = f42
   228  	ldf8		f32 = [up], 8
   229  	xma.l		f37 = f33, f6, f0
   230  	;;
   231  	getf.sig	r17 = f39
   232  	xma.hu		f41 = f33, f6, f0
   233  	;;
   234  	getf.sig	r21 = f43
   235  	ldf8		f33 = [up], 8
   236  	xma.l		f38 = f34, f6, f0
   237  	;;
   238  	getf.sig	r18 = f36
   239  	xma.hu		f42 = f34, f6, f0
   240  	br		.LL10
   241  
   242  
   243  .Lb11:	ldf8		f34 = [up], 8
        C Feed-in for n mod 4 == 3.  f34 = up[1], f35 = up[2].  r22 = 0 stands
        C in for the high limb that is added to the first low limb when the
        C loop is entered at .LL11.  br.cloop falls through only when (n-1)/4
        C is zero, i.e. n = 3.
   244  	mov		r22 = 0
   245  	;;
   246  	ldf8		f35 = [up], 8
   247  	br.cloop.dptk	.grt3
   248  	;;
   249  
        C n = 3: form all three products (the first one takes f9), store the
        C first low limb directly from f37, fetch the final high half into r8
        C and finish the carry chain at .Lcj3.
   250  	xma.l		f37 = f7, f6, f9
   251  	xma.hu		f41 = f7, f6, f9
   252  	xma.l		f38 = f34, f6, f0
   253  	xma.hu		f42 = f34, f6, f0
   254  	xma.l		f39 = f35, f6, f0
   255  	xma.hu		f43 = f35, f6, f0
   256  	;;
   257  	getf.sig	r23 = f41
   258  	stf8		[rp] = f37, 8
   259  	getf.sig	r16 = f38
   260  	getf.sig	r20 = f42
   261  	getf.sig	r17 = f39
   262  	getf.sig	r8 = f43
   263  	br		.Lcj3
   264  
   265  .grt3:
        C n >= 7: load four more limbs and form the first three products; only
        C the first one takes the carry-in addend f9.
   266  	ldf8		f32 = [up], 8
   267  	xma.l		f37 = f7, f6, f9
   268  	xma.hu		f41 = f7, f6, f9
   269  	;;
   270  	ldf8		f33 = [up], 8
   271  	xma.l		f38 = f34, f6, f0
   272  	xma.hu		f42 = f34, f6, f0
   273  	;;
   274  	getf.sig	r19 = f37
   275  	ldf8		f34 = [up], 8
   276  	xma.l		f39 = f35, f6, f0
   277  	xma.hu		f43 = f35, f6, f0
   278  	;;
   279  	getf.sig	r23 = f41
   280  	ldf8		f35 = [up], 8
   281  	br.cloop.dptk	.grt7
   282  
        C n = 7: store the first low limb (r19) directly and join the
        C wind-down code at .Lcj7.
   283  	getf.sig	r16 = f38
   284  	xma.l		f36 = f32, f6, f0
   285  	getf.sig	r20 = f42
   286  	xma.hu		f40 = f32, f6, f0
   287  	;;
   288  	getf.sig	r17 = f39
   289  	xma.l		f37 = f33, f6, f0
   290  	getf.sig	r21 = f43
   291  	xma.hu		f41 = f33, f6, f0
   292  	;;
   293  	getf.sig	r18 = f36
   294  	st8		[rp] = r19, 8
   295  	xma.l		f38 = f34, f6, f0
   296  	xma.hu		f42 = f34, f6, f0
   297  	br		.Lcj7
   298  
   299  .grt7:
        C n >= 11: prime the pipeline and enter the main loop at .LL11.
   300  	getf.sig	r16 = f38
   301  	xma.l		f36 = f32, f6, f0
   302  	xma.hu		f40 = f32, f6, f0
   303  	;;
   304  	getf.sig	r20 = f42
   305  	ldf8		f32 = [up], 8
   306  	xma.l		f37 = f33, f6, f0
   307  	;;
   308  	getf.sig	r17 = f39
   309  	xma.hu		f41 = f33, f6, f0
   310  	br		.LL11
   311  
   312  
   313  .Lb00:	ldf8		f33 = [up], 8
        C Feed-in for n mod 4 == 0.  f33 = up[1], f34 = up[2], f35 = up[3].
        C r21 = 0 stands in for the high limb that is added to the first low
        C limb when the loop is entered at .LL00.  The first product is the
        C only one that takes the carry-in addend f9.  br.cloop falls through
        C only when (n-1)/4 is zero, i.e. n = 4.
   314  	mov		r21 = 0
   315  	;;
   316  	ldf8		f34 = [up], 8
   317  	;;
   318  	ldf8		f35 = [up], 8
   319  	xma.l		f36 = f7, f6, f9
   320  	xma.hu		f40 = f7, f6, f9
   321  	br.cloop.dptk	.grt4
   322  
        C n = 4: form the remaining three products, store the first low limb
        C directly from f36 and join the wind-down code at .Lcj4.
   323  	xma.l		f37 = f33, f6, f0
   324  	xma.hu		f41 = f33, f6, f0
   325  	xma.l		f38 = f34, f6, f0
   326  	xma.hu		f42 = f34, f6, f0
   327  	;;
   328  	getf.sig	r22 = f40
   329  	stf8		[rp] = f36, 8
   330  	xma.l		f39 = f35, f6, f0
   331  	getf.sig	r19 = f37
   332  	xma.hu		f43 = f35, f6, f0
   333  	;;
   334  	getf.sig	r23 = f41
   335  	getf.sig	r16 = f38
   336  	getf.sig	r20 = f42
   337  	getf.sig	r17 = f39
   338  	br		.Lcj4
   339  
   340  .grt4:
        C n >= 8: load the next four limbs while forming products two to four.
   341  	ldf8		f32 = [up], 8
   342  	xma.l		f37 = f33, f6, f0
   343  	xma.hu		f41 = f33, f6, f0
   344  	;;
   345  	getf.sig	r18 = f36
   346  	ldf8		f33 = [up], 8
   347  	xma.l		f38 = f34, f6, f0
   348  	xma.hu		f42 = f34, f6, f0
   349  	;;
   350  	getf.sig	r22 = f40
   351  	ldf8		f34 = [up], 8
   352  	xma.l		f39 = f35, f6, f0
   353  	;;
   354  	getf.sig	r19 = f37
   355  	getf.sig	r23 = f41
   356  	xma.hu		f43 = f35, f6, f0
   357  	ldf8		f35 = [up], 8
   358  	br.cloop.dptk	.grt8
   359  
        C n = 8: store the first low limb (r18) directly and join the
        C wind-down code at .Lcj8.
   360  	getf.sig	r16 = f38
   361  	xma.l		f36 = f32, f6, f0
   362  	getf.sig	r20 = f42
   363  	xma.hu		f40 = f32, f6, f0
   364  	;;
   365  	getf.sig	r17 = f39
   366  	st8		[rp] = r18, 8
   367  	xma.l		f37 = f33, f6, f0
   368  	xma.hu		f41 = f33, f6, f0
   369  	br		.Lcj8
   370  
   371  .grt8:
        C n >= 12: enter the main loop at .LL00.
   372  	getf.sig	r16 = f38
   373  	xma.l		f36 = f32, f6, f0
   374  	xma.hu		f40 = f32, f6, f0
   375  	br		.LL00
   376  
   377  
   378  C *** MAIN LOOP START ***
        C Four limbs are retired per iteration.  For each limb, xma.l and
        C xma.hu form the low and high halves of up[i]*vl in f36-f39 and
        C f40-f43, getf.sig moves them to r16-r19 (low halves) and r20-r23
        C (high halves), and r24 = low half + previous high half (+ 1 when a
        C carry is pending) is stored through rp.
        C The carry is kept in predicates that alternate between two pairs:
        C p6 or p8 set means carry pending, p7 or p9 set means no carry.
        C After a + b + 1 the carry-out test is sum <= a (cmp.leu); after
        C a + b it is sum < a (cmp.ltu).
   379  	ALIGN(32)
   380  .Loop:
   381  	.pred.rel "mutex",p6,p7
   382  	getf.sig	r16 = f38
   383  	xma.l		f36 = f32, f6, f0
   384     (p6)	cmp.leu		p8, p9 = r24, r17
   385  	st8		[rp] = r24, 8
   386  	xma.hu		f40 = f32, f6, f0
   387     (p7)	cmp.ltu		p8, p9 = r24, r17
   388  	;;
   389  .LL00:
        C Loop entry for n mod 4 == 0 (from .grt8).
   390  	.pred.rel "mutex",p8,p9
   391  	getf.sig	r20 = f42
   392     (p8)	add		r24 = r18, r21, 1
   393  	nop.b		0
   394  	ldf8		f32 = [up], 8
   395     (p9)	add		r24 = r18, r21
   396  	nop.b		0
   397  	;;
   398  	.pred.rel "mutex",p8,p9
   399  	getf.sig	r17 = f39
   400  	xma.l		f37 = f33, f6, f0
   401     (p8)	cmp.leu		p6, p7 = r24, r18
   402  	st8		[rp] = r24, 8
   403  	xma.hu		f41 = f33, f6, f0
   404     (p9)	cmp.ltu		p6, p7 = r24, r18
   405  	;;
   406  .LL11:
        C Loop entry for n mod 4 == 3 (from .grt7).
   407  	.pred.rel "mutex",p6,p7
   408  	getf.sig	r21 = f43
   409     (p6)	add		r24 = r19, r22, 1
   410  	nop.b		0
   411  	ldf8		f33 = [up], 8
   412     (p7)	add		r24 = r19, r22
   413  	nop.b		0
   414  	;;
   415  	.pred.rel "mutex",p6,p7
   416  	getf.sig	r18 = f36
   417  	xma.l		f38 = f34, f6, f0
   418     (p6)	cmp.leu		p8, p9 = r24, r19
   419  	st8		[rp] = r24, 8
   420  	xma.hu		f42 = f34, f6, f0
   421     (p7)	cmp.ltu		p8, p9 = r24, r19
   422  	;;
   423  .LL10:
        C Loop entry for n mod 4 == 2 (from .grt6).
   424  	.pred.rel "mutex",p8,p9
   425  	getf.sig	r22 = f40
   426     (p8)	add		r24 = r16, r23, 1
   427  	nop.b		0
   428  	ldf8		f34 = [up], 8
   429     (p9)	add		r24 = r16, r23
   430  	nop.b		0
   431  	;;
   432  	.pred.rel "mutex",p8,p9
   433  	getf.sig	r19 = f37
   434  	xma.l		f39 = f35, f6, f0
   435     (p8)	cmp.leu		p6, p7 = r24, r16
   436  	st8		[rp] = r24, 8
   437  	xma.hu		f43 = f35, f6, f0
   438     (p9)	cmp.ltu		p6, p7 = r24, r16
   439  	;;
   440  .LL01:
        C Loop entry for n mod 4 == 1 (from .grt5).  The r24 formed here is
        C stored at the top of the next iteration, or at .Lcj9 on loop exit.
   441  	.pred.rel "mutex",p6,p7
   442  	getf.sig	r23 = f41
   443     (p6)	add		r24 = r17, r20, 1
   444  	nop.b		0
   445  	ldf8		f35 = [up], 8
   446     (p7)	add		r24 = r17, r20
   447  	br.cloop.dptk	.Loop
   448  C *** MAIN LOOP END ***
   449  	;;
   450  
   451  .Lcj9:
        C Wind-down: the same add/compare carry chain as the main loop, but
        C with no further loads from up.  .Lcj9 is reached by falling out of
        C the loop; .Lcj8 down to .Lcj3 are also branch targets of the
        C short-operand feed-in paths (n = 8 down to n = 3).  No branch to
        C .Lcj9 or .Lcj2 is visible in this file.
   452  	.pred.rel "mutex",p6,p7
   453  	getf.sig	r16 = f38
   454  	xma.l		f36 = f32, f6, f0
   455     (p6)	cmp.leu		p8, p9 = r24, r17
   456  	st8		[rp] = r24, 8
   457  	xma.hu		f40 = f32, f6, f0
   458     (p7)	cmp.ltu		p8, p9 = r24, r17
   459  	;;
   460  	.pred.rel "mutex",p8,p9
   461  	getf.sig	r20 = f42
   462     (p8)	add		r24 = r18, r21, 1
   463     (p9)	add		r24 = r18, r21
   464  	;;
   465  	.pred.rel "mutex",p8,p9
   466  	getf.sig	r17 = f39
   467  	xma.l		f37 = f33, f6, f0
   468     (p8)	cmp.leu		p6, p7 = r24, r18
   469  	st8		[rp] = r24, 8
   470  	xma.hu		f41 = f33, f6, f0
   471     (p9)	cmp.ltu		p6, p7 = r24, r18
   472  	;;
   473  .Lcj8:
   474  	.pred.rel "mutex",p6,p7
   475  	getf.sig	r21 = f43
   476     (p6)	add		r24 = r19, r22, 1
   477     (p7)	add		r24 = r19, r22
   478  	;;
   479  	.pred.rel "mutex",p6,p7
   480  	getf.sig	r18 = f36
   481  	xma.l		f38 = f34, f6, f0
   482     (p6)	cmp.leu		p8, p9 = r24, r19
   483  	st8		[rp] = r24, 8
   484  	xma.hu		f42 = f34, f6, f0
   485     (p7)	cmp.ltu		p8, p9 = r24, r19
   486  	;;
   487  .Lcj7:
   488  	.pred.rel "mutex",p8,p9
   489  	getf.sig	r22 = f40
   490     (p8)	add		r24 = r16, r23, 1
   491     (p9)	add		r24 = r16, r23
   492  	;;
   493  	.pred.rel "mutex",p8,p9
   494  	getf.sig	r19 = f37
   495  	xma.l		f39 = f35, f6, f0
   496     (p8)	cmp.leu		p6, p7 = r24, r16
   497  	st8		[rp] = r24, 8
   498  	xma.hu		f43 = f35, f6, f0
   499     (p9)	cmp.ltu		p6, p7 = r24, r16
   500  	;;
   501  .Lcj6:
        C From here on all products have been issued; only transfers to
        C integer registers, adds, carry tests and stores remain.
   502  	.pred.rel "mutex",p6,p7
   503  	getf.sig	r23 = f41
   504     (p6)	add		r24 = r17, r20, 1
   505     (p7)	add		r24 = r17, r20
   506  	;;
   507  	.pred.rel "mutex",p6,p7
   508     (p6)	cmp.leu		p8, p9 = r24, r17
   509     (p7)	cmp.ltu		p8, p9 = r24, r17
   510  	getf.sig	r16 = f38
   511  	st8		[rp] = r24, 8
   512  	;;
   513  .Lcj5:
   514  	.pred.rel "mutex",p8,p9
   515  	getf.sig	r20 = f42
   516     (p8)	add		r24 = r18, r21, 1
   517     (p9)	add		r24 = r18, r21
   518  	;;
   519  	.pred.rel "mutex",p8,p9
   520     (p8)	cmp.leu		p6, p7 = r24, r18
   521     (p9)	cmp.ltu		p6, p7 = r24, r18
   522  	getf.sig	r17 = f39
   523  	st8		[rp] = r24, 8
   524  	;;
   525  .Lcj4:
        C r8 = high half of the final product (f43); the last carry is added
        C to it just before returning.
   526  	.pred.rel "mutex",p6,p7
   527  	getf.sig	r8 = f43
   528     (p6)	add		r24 = r19, r22, 1
   529     (p7)	add		r24 = r19, r22
   530  	;;
   531  	.pred.rel "mutex",p6,p7
   532  	st8		[rp] = r24, 8
   533     (p6)	cmp.leu		p8, p9 = r24, r19
   534     (p7)	cmp.ltu		p8, p9 = r24, r19
   535  	;;
   536  .Lcj3:
   537  	.pred.rel "mutex",p8,p9
   538     (p8)	add		r24 = r16, r23, 1
   539     (p9)	add		r24 = r16, r23
   540  	;;
   541  	.pred.rel "mutex",p8,p9
   542  	st8		[rp] = r24, 8
   543     (p8)	cmp.leu		p6, p7 = r24, r16
   544     (p9)	cmp.ltu		p6, p7 = r24, r16
   545  	;;
   546  .Lcj2:
        C Last result limb: low half of the final product plus the previous
        C high half and carry.
   547  	.pred.rel "mutex",p6,p7
   548     (p6)	add		r24 = r17, r20, 1
   549     (p7)	add		r24 = r17, r20
   550  	;;
   551  	.pred.rel "mutex",p6,p7
   552  	st8		[rp] = r24, 8
   553     (p6)	cmp.leu		p8, p9 = r24, r17
   554     (p7)	cmp.ltu		p8, p9 = r24, r17
   555  	;;
        C Fold the final carry into the high limb in r8, restore the caller's
        C ar.lc from r2 and return.
   556     (p8)	add		r8 = 1, r8
   557  	mov.i		ar.lc = r2
   558  	br.ret.sptk.many b0
   559  EPILOGUE()
   560  
   561  PROLOGUE(mpn_mul_1c)
        C mpn_mul_1c (rp, up, n, vl, cy): same as mpn_mul_1 but with a carry-in
        C limb cy.  The setup mirrors the head of mpn_mul_1 except that the
        C addend f9 is loaded from cy instead of being cleared; control then
        C joins mpn_mul_1 at .Lcommon and returns from there.
   562  	.prologue
   563  	.save	ar.lc, r2
   564  	.body
   565  
        C 32-bit ABI only: widen both pointers with addp4 and zero-extend n.
   566  ifdef(`HAVE_ABI_32',
   567  `	addp4		rp = 0, rp		C M I
   568  	addp4		up = 0, up		C M I
   569  	zxt4		n = n			C I
   570  	;;
   571  ')
        C r15 = n-1, f9 = cy (carry-in addend), r2 = caller's ar.lc.
   572  {.mmi
   573  	adds		r15 = -1, n		C M I
   574  	setf.sig	f9 = cy			C M2 M3
   575  	mov.i		r2 = ar.lc		C I0
   576  }
        C f7 = up[0] (up is post-incremented by one limb), r14 = n mod 4, then
        C continue in the shared code of mpn_mul_1.
   577  {.mmb
   578  	ldf8		f7 = [up], 8		C M
   579  	and		r14 = 3, n		C M I
   580  	br.sptk		.Lcommon
   581  	;;
   582  }
   583  EPILOGUE()
   584  ASM_END()