github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/lshiftc.asm (about)

     1  dnl  IA-64 mpn_lshiftc.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2000-2005, 2010 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C           cycles/limb
    36  C Itanium:      ?
    37  C Itanium 2:    1.25
    38  
    39  C This code is scheduled deeply since the plain shift instructions shr and shl
    40  C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
    41  C these instructions causes a 10 cycle replay trap on Itanium.
    42  
    43  C The ld8 scheduling should probably be decreased to make the function smaller.
    44  C Good lfetch will make sure we never stall anyway.
    45  
    46  C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
    47  C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
    48  C in the prologue.
    49  
    50  
    51  C INPUT PARAMETERS
    52  define(`rp', `r32')		C destination pointer, target of every st8
    53  define(`up', `r33')		C source pointer, read with ld8
    54  define(`n',  `r34')		C limb count; later reused as the loop count (n-1) >> 2
    55  define(`cnt',`r35')		C shift count applied with FSH
    56  
    57  define(`tnc',`r9')		C 64 - cnt, computed at function entry and used with BSH
    58  
    59  define(`FSH',`shl')		C shift by cnt in the direction of the operation (left)
    60  define(`BSH',`shr.u')		C logical shift by tnc the other way, giving the bits that cross into the next limb
    61  define(`UPD',`-8')		C post-increment used by ld8 and st8: step down one 8-byte limb
    62  define(`POFF',`-512')		C starting offset of the lfetch pointer r11 relative to the highest source limb
    63  define(`PUPD',`-32')		C post-increment of the lfetch pointer, applied once per loop iteration
    64  define(`func',`mpn_lshiftc')	C function name handed to EPILOGUE
    65  
    66  ASM_START()
    67  PROLOGUE(mpn_lshiftc)
        C mpn_lshiftc: shift the n-limb operand at up left by cnt bits and store
        C the bitwise complement of every result limb at rp.
        C
        C   rp  (r32)  destination, written with st8
        C   up  (r33)  source, read with ld8
        C   n   (r34)  number of 8-byte limbs
        C   cnt (r35)  shift count; tnc (r9) is set to 64 - cnt
        C
        C Both pointers are first advanced to the highest limb, at byte offset
        C 8*(n-1), and then step downwards by 8 (UPD) on every ld8 and st8.
        C Each stored limb is  -1 - ((up[i] << cnt) | (up[i-1] >> tnc)), which is the
        C complement of the shifted limb.  The lowest limb has no lower neighbour,
        C so it is stored as  -1 - (up[0] << cnt).
        C r8 receives up[n-1] >> tnc, the bits shifted out at the top, and is not
        C written again; r8 is presumably the return value register (confirm
        C against the IA-64 software conventions).
        C
        C Structure: dispatch on n mod 4 to one of four feed-in paths (.Lb00,
        C .Lb01, .Lb10, .Lb11), each of which either finishes a short operand
        C directly in the tail (.Lr1 to .Lr8) or enters the 4-limb main loop at the
        C matching point (.LL00, .LL10, .LL11 or L(end)).
        C
        C Registers written: r2, r8, r9, r10, r11, r14-r27, r31, p6-p8, ar.lc,
        C plus the four argument registers.  ar.lc is saved in r2 and restored
        C before returning.
        C NOTE(review): no handling of n = 0 is visible, and cnt = 0 would make
        C tnc = 64; presumably callers guarantee n >= 1 and 1 <= cnt <= 63.
    68  	.prologue
    69  	.save	ar.lc, r2		C unwind info: ar.lc is kept in r2
    70  	.body
        C 32-bit ABI only: addp4 widens the two pointers, sxt4 sign-extends n and
        C zxt4 zero-extends cnt.
    71  ifdef(`HAVE_ABI_32',
    72  `	addp4	rp = 0, rp		C				M I
    73  	addp4	up = 0, up		C				M I
    74  	sxt4	n = n			C				M I
    75  	nop.m		0
    76  	nop.m		0
    77  	zxt4	cnt = cnt		C				I
    78  	;;
    79  ')
    80  
        C Setup, group 1: r14 = n mod 4 selects the feed-in path, r2 = saved ar.lc,
        C r15 = n - 1, tnc = 64 - cnt.
    81   {.mmi;	nop	0			C				M I
    82  	and	r14 = 3, n		C				M I
    83  	mov.i	r2 = ar.lc		C				I0
    84  }{.mmi;	add	r15 = -1, n		C				M I
    85  	sub	tnc = 64, cnt		C				M I
    86  	nop	0
    87  	;;
        C Setup, group 2: p6, p7, p8 are set when n mod 4 is 1, 2, 3 respectively;
        C n becomes (n-1) >> 2, the loop count; up and rp move to the highest limb.
    88  }{.mmi;	cmp.eq	p6, p0 = 1, r14		C				M I
    89  	cmp.eq	p7, p0 = 2, r14		C				M I
    90  	shr.u	n = r15, 2		C				I0
    91  }{.mmi;	cmp.eq	p8, p0 = 3, r14		C				M I
    92  	shladd	up = r15, 3, up		C				M I
    93  	shladd	rp = r15, 3, rp		C				M I
    94  	;;
        C Setup, group 3: r11 = prefetch pointer, POFF bytes from the highest source
        C limb; r10 = highest source limb; ar.lc = loop count.  Then dispatch on
        C n mod 4; falling through all three branches means n mod 4 == 0.
    95  }{.mmi;	add	r11 = POFF, up		C				M I
    96  	ld8	r10 = [up], UPD		C				M01
    97  	mov.i	ar.lc = n		C				I0
    98  }{.bbb;
    99     (p6)	br.dptk	.Lb01
   100     (p7)	br.dptk	.Lb10
   101     (p8)	br.dptk	.Lb11
   102  	;; }
   103  
   104  .Lb00:		C feed-in for n mod 4 == 0
   105  	ld8	r19 = [up], UPD		C load the three limbs below r10
   106  	;;
   107  	ld8	r16 = [up], UPD
   108  	;;
   109  	ld8	r17 = [up], UPD
   110  	BSH	r8 = r10, tnc		C r8 = bits shifted out of the highest limb
   111  	br.cloop.dptk	L(gt4)		C taken (and ar.lc decremented) when ar.lc != 0, i.e. n > 4
   112  	;;
   113  	FSH	r24 = r10, cnt		C n == 4: form all four result limbs here
   114  	BSH	r25 = r19, tnc
   115  	;;
   116  	FSH	r26 = r19, cnt
   117  	BSH	r27 = r16, tnc
   118  	;;
   119  	FSH	r20 = r16, cnt
   120  	BSH	r21 = r17, tnc
   121  	;;
   122  	or	r14 = r25, r24		C highest result limb before complementing
   123  	FSH	r22 = r17, cnt		C lowest limb: nothing is shifted in from below
   124  	;;
   125  	or	r15 = r27, r26
   126  	sub	r31 = -1, r14		C r31 = -1 - r14, the complement of r14
   127  	br	.Lr4		C four stores remain
   128  
   129  L(gt4):		C n mod 4 == 0 and n > 4: keep loading while combining
   130   {.mmi;	nop	0
   131  	nop	0
   132  	FSH	r24 = r10, cnt
   133  }{.mmi;	ld8	r18 = [up], UPD
   134  	nop	0
   135  	BSH	r25 = r19, tnc
   136  	;; }
   137   {.mmi;	nop	0
   138  	nop	0
   139  	FSH	r26 = r19, cnt		C reads r19 before the ld8 in this group replaces it
   140  }{.mmi;	ld8	r19 = [up], UPD
   141  	nop	0
   142  	BSH	r27 = r16, tnc
   143  	;; }
   144   {.mmi;	nop	0
   145  	nop	0
   146  	FSH	r20 = r16, cnt
   147  }{.mmi;	ld8	r16 = [up], UPD
   148  	nop	0
   149  	BSH	r21 = r17, tnc
   150  	;; }
   151   {.mmi;	nop	0
   152  	or	r14 = r25, r24
   153  	FSH	r22 = r17, cnt
   154  }{.mib;	ld8	r17 = [up], UPD
   155  	BSH	r23 = r18, tnc
   156  	br.cloop.dptk	L(gt8)		C taken when n > 8
   157  	;; }
   158   {.mmi;	nop	0
   159  	or	r15 = r27, r26
   160  	FSH	r24 = r18, cnt
   161  }{.mib;	sub	r31 = -1, r14
   162  	BSH	r25 = r19, tnc
   163  	br	.Lr8 }		C n == 8: eight stores remain
   164  
   165  L(gt8):		C n mod 4 == 0 and n > 8: enter the main loop at .LL00
   166  	or	r15 = r27, r26
   167  	FSH	r24 = r18, cnt
   168  	ld8	r18 = [up], UPD
   169  	sub	r31 = -1, r14
   170  	BSH	r25 = r19, tnc
   171  	br	.LL00
   172  
   173  .Lb01:		C feed-in for n mod 4 == 1
   174  	br.cloop.dptk	L(gt1)		C taken when n > 1
   175  	;;
   176  	BSH	r8 = r10, tnc		C n == 1: r8 = bits shifted out
   177  	FSH	r22 = r10, cnt		C the single result limb before complementing
   178  	;;
   179  	sub	r31 = -1, r22
   180  	br	.Lr1		C one store remains
   181  	;;
   182  L(gt1):		C n mod 4 == 1 and n > 1
   183  	ld8	r18 = [up], UPD
   184  	BSH	r8 = r10, tnc		C r8 = bits shifted out of the highest limb
   185  	FSH	r22 = r10, cnt
   186  	;;
   187  	ld8	r19 = [up], UPD
   188  	;;
   189  	ld8	r16 = [up], UPD
   190  	;;
   191  	ld8	r17 = [up], UPD
   192  	BSH	r23 = r18, tnc
   193  	br.cloop.dptk	L(gt5)		C taken when n > 5
   194  	;;
   195  	nop	0
   196  	FSH	r24 = r18, cnt		C n == 5: form the remaining limbs here
   197  	BSH	r25 = r19, tnc
   198  	;;
   199  	nop	0
   200  	FSH	r26 = r19, cnt
   201  	BSH	r27 = r16, tnc
   202  	;;
   203  	or	r15 = r23, r22
   204  	FSH	r20 = r16, cnt
   205  	BSH	r21 = r17, tnc
   206  	;;
   207  	or	r14 = r25, r24
   208  	FSH	r22 = r17, cnt
   209  	sub	r31 = -1, r15
   210  	br	.Lr5		C five stores remain
   211  
   212  L(gt5):		C n mod 4 == 1 and n > 5: join the loop at L(end)
   213   {.mmi;	nop	0
   214  	nop	0
   215  	FSH	r24 = r18, cnt
   216  }{.mmi;	ld8	r18 = [up], UPD
   217  	nop	0
   218  	BSH	r25 = r19, tnc
   219  	;; }
   220   {.mmi;	nop	0
   221  	nop	0
   222  	FSH	r26 = r19, cnt
   223  }{.mmi;	ld8	r19 = [up], UPD
   224  	nop	0
   225  	BSH	r27 = r16, tnc
   226  	;; }
   227   {.mmi;	nop	0
   228  	or	r15 = r23, r22
   229  	FSH	r20 = r16, cnt
   230  }{.mmi;	ld8	r16 = [up], UPD
   231  	nop	0
   232  	BSH	r21 = r17, tnc
   233  	;; }
   234   {.mmi;	or	r14 = r25, r24
   235  	sub	r31 = -1, r15
   236  	FSH	r22 = r17, cnt
   237  }{.mib;	ld8	r17 = [up], UPD
   238  	BSH	r23 = r18, tnc
   239  	br	L(end)
   240  	;; }
   241  
   242  .Lb10:		C feed-in for n mod 4 == 2
   243  	ld8	r17 = [up], UPD		C second-highest limb
   244  	br.cloop.dptk	L(gt2)		C taken when n > 2
   245  	;;
   246  	BSH	r8 = r10, tnc		C n == 2: r8 = bits shifted out
   247  	FSH	r20 = r10, cnt
   248  	;;
   249  	BSH	r21 = r17, tnc
   250  	FSH	r22 = r17, cnt		C lowest limb before complementing
   251  	;;
   252  	or	r14 = r21, r20
   253  	;;
   254  	sub	r31 = -1, r14
   255  	br	.Lr2		C two stores remain
   256  	;;
   257  L(gt2):		C n mod 4 == 2 and n > 2
   258  	ld8	r18 = [up], UPD
   259  	BSH	r8 = r10, tnc		C r8 = bits shifted out of the highest limb
   260  	FSH	r20 = r10, cnt
   261  	;;
   262  	ld8	r19 = [up], UPD
   263  	;;
   264  	ld8	r16 = [up], UPD
   265  	BSH	r21 = r17, tnc
   266  	FSH	r22 = r17, cnt
   267  	;;
   268  	ld8	r17 = [up], UPD
   269  	BSH	r23 = r18, tnc
   270  	br.cloop.dptk	L(gt6)		C taken when n > 6
   271  	;;
   272  	nop	0
   273  	FSH	r24 = r18, cnt		C n == 6: form the remaining limbs here
   274  	BSH	r25 = r19, tnc
   275  	;;
   276  	or	r14 = r21, r20
   277  	FSH	r26 = r19, cnt
   278  	BSH	r27 = r16, tnc
   279  	;;
   280   {.mmi;	nop	0
   281  	or	r15 = r23, r22
   282  	FSH	r20 = r16, cnt
   283  }{.mib;	sub	r31 = -1, r14
   284  	BSH	r21 = r17, tnc
   285  	br	.Lr6		C six stores remain
   286  	;; }
   287  L(gt6):		C n mod 4 == 2 and n > 6: enter the main loop at .LL10
   288   {.mmi;	nop	0
   289  	nop	0
   290  	FSH	r24 = r18, cnt
   291  }{.mmi;	ld8	r18 = [up], UPD
   292  	nop	0
   293  	BSH	r25 = r19, tnc
   294  	;; }
   295   {.mmi; nop   0
   296  	or	r14 = r21, r20
   297  	FSH	r26 = r19, cnt
   298  }{.mmi;	ld8	r19 = [up], UPD
   299  	nop	0
   300  	BSH	r27 = r16, tnc
   301  	;; }
   302   {.mmi;	or	r15 = r23, r22
   303  	sub	r31 = -1, r14
   304  	FSH	r20 = r16, cnt
   305  }{.mib;	ld8	r16 = [up], UPD
   306  	BSH	r21 = r17, tnc
   307  	br	.LL10
   308  }
   309  
   310  .Lb11:		C feed-in for n mod 4 == 3
   311  	ld8	r16 = [up], UPD
   312  	;;
   313  	ld8	r17 = [up], UPD
   314  	BSH	r8 = r10, tnc		C r8 = bits shifted out of the highest limb
   315  	FSH	r26 = r10, cnt
   316  	br.cloop.dptk	L(gt3)		C taken when n > 3
   317  	;;
   318  	BSH	r27 = r16, tnc		C n == 3: form all three result limbs here
   319  	;;
   320  	FSH	r20 = r16, cnt
   321  	BSH	r21 = r17, tnc
   322  	;;
   323  	FSH	r22 = r17, cnt		C lowest limb before complementing
   324  	;;
   325  	or	r15 = r27, r26
   326  	;;
   327  	or	r14 = r21, r20
   328  	sub	r31 = -1, r15
   329  	br	.Lr3		C three stores remain
   330  	;;
   331  L(gt3):		C n mod 4 == 3 and n > 3
   332  	ld8	r18 = [up], UPD
   333  	;;
   334  	ld8	r19 = [up], UPD
   335  	BSH	r27 = r16, tnc
   336  	;;
   337   {.mmi;	nop	0
   338  	nop	0
   339  	FSH	r20 = r16, cnt
   340  }{.mmi;	ld8	r16 = [up], UPD
   341  	nop	0
   342  	BSH	r21 = r17, tnc
   343  	;;
   344  }{.mmi;	nop	0
   345  	nop	0
   346  	FSH	r22 = r17, cnt
   347  }{.mib;	ld8	r17 = [up], UPD
   348  	BSH	r23 = r18, tnc
   349  	br.cloop.dptk	L(gt7)		C taken when n > 7
   350  	;; }
   351  	or	r15 = r27, r26		C n == 7: form the remaining limbs here
   352  	FSH	r24 = r18, cnt
   353  	BSH	r25 = r19, tnc
   354  	;;
   355   {.mmi;	nop	0
   356  	or	r14 = r21, r20
   357  	FSH	r26 = r19, cnt
   358  }{.mib;	sub	r31 = -1, r15
   359  	BSH	r27 = r16, tnc
   360  	br	.Lr7		C seven stores remain
   361  }
   362  L(gt7):		C n mod 4 == 3 and n > 7: enter the main loop at .LL11
   363   {.mmi;	nop	0
   364  	or	r15 = r27, r26
   365  	FSH	r24 = r18, cnt
   366  }{.mmi;	ld8	r18 = [up], UPD
   367  	nop	0
   368  	BSH	r25 = r19, tnc
   369  	;; }
   370   {.mmi;	or	r14 = r21, r20
   371  	sub	r31 = -1, r15
   372  	FSH	r26 = r19, cnt
   373  }{.mib;	ld8	r19 = [up], UPD
   374  	BSH	r27 = r16, tnc
   375  	br	.LL11
   376  }
   377  
   378  C *** MAIN LOOP START ***
        C Four limbs per iteration.  Each of the four instruction groups stores one
        C finished limb from r31, ORs the two shifted halves of a later limb,
        C complements the limb ORed one group earlier, shifts two source limbs and
        C loads a new one.  Several groups read a register that the same group also
        C writes (r31, r16-r19); the schedule depends on those reads seeing the
        C value from before the group.
   379  	ALIGN(32)
   380  L(top):
   381  .LL01:
   382   {.mmi;	st8	[rp] = r31, UPD		C M2
   383  	or	r15 = r27, r26		C M3
   384  	FSH	r24 = r18, cnt		C I0
   385  }{.mmi;	ld8	r18 = [up], UPD		C M0
   386  	sub	r31 = -1, r14		C M1
   387  	BSH	r25 = r19, tnc		C I1
   388  	;; }
   389  .LL00:		C loop entry used by L(gt8)
   390   {.mmi;	st8	[rp] = r31, UPD
   391  	or	r14 = r21, r20
   392  	FSH	r26 = r19, cnt
   393  }{.mmi;	ld8	r19 = [up], UPD
   394  	sub	r31 = -1, r15
   395  	BSH	r27 = r16, tnc
   396  	;; }
   397  .LL11:		C loop entry used by L(gt7)
   398   {.mmi;	st8	[rp] = r31, UPD
   399  	or	r15 = r23, r22
   400  	FSH	r20 = r16, cnt
   401  }{.mmi;	ld8	r16 = [up], UPD
   402  	sub	r31 = -1, r14
   403  	BSH	r21 = r17, tnc
   404  	;; }
   405  .LL10:		C loop entry used by L(gt6)
   406   {.mmi;	st8	[rp] = r31, UPD
   407  	or	r14 = r25, r24
   408  	FSH	r22 = r17, cnt
   409  }{.mmi;	ld8	r17 = [up], UPD
   410  	sub	r31 = -1, r15
   411  	BSH	r23 = r18, tnc
   412  	;; }
   413  L(end):	lfetch		[r11], PUPD		C loop entry used by L(gt5); prefetch below the descending source pointer, then move r11 down 32 bytes
   414  	br.cloop.dptk	L(top)		C repeat while ar.lc != 0
   415  C *** MAIN LOOP END ***
   416  
        C Wind-down: no more loads.  Nine limbs are still to be stored when the loop
        C falls through to here; each label .LrK below is reached with K stores
        C remaining and is also the target of the short-operand paths above.
   417   {.mmi;	st8	[rp] = r31, UPD
   418  	or	r15 = r27, r26
   419  	FSH	r24 = r18, cnt
   420  }{.mib;	sub	r31 = -1, r14
   421  	BSH	r25 = r19, tnc
   422  	nop	0
   423  	;; }
   424  .Lr8:
   425   {.mmi;	st8	[rp] = r31, UPD
   426  	or	r14 = r21, r20
   427  	FSH	r26 = r19, cnt
   428  }{.mib;	sub	r31 = -1, r15
   429  	BSH	r27 = r16, tnc
   430  	nop	0
   431  	;; }
   432  .Lr7:
   433   {.mmi;	st8	[rp] = r31, UPD
   434  	or	r15 = r23, r22
   435  	FSH	r20 = r16, cnt
   436  }{.mib;	sub	r31 = -1, r14
   437  	BSH	r21 = r17, tnc
   438  	nop	0
   439  	;; }
   440  .Lr6:	st8	[rp] = r31, UPD
   441  	or	r14 = r25, r24
   442  	FSH	r22 = r17, cnt		C r17 holds the lowest source limb here; r22 is its shifted value
   443  	sub	r31 = -1, r15
   444  	;;
   445  .Lr5:	st8	[rp] = r31, UPD
   446  	or	r15 = r27, r26
   447  	sub	r31 = -1, r14
   448  	;;
   449  .Lr4:	st8	[rp] = r31, UPD
   450  	or	r14 = r21, r20
   451  	sub	r31 = -1, r15
   452  	;;
   453  .Lr3:	st8	[rp] = r31, UPD
   454  	sub	r31 = -1, r14
   455  	;;
   456  .Lr2:	st8	[rp] = r31, UPD
   457  	sub	r31 = -1, r22		C lowest limb: complement of the shifted limb with nothing ORed in
   458  	;;
   459  .Lr1:	st8	[rp] = r31, UPD		C				M23
   460  	mov	ar.lc = r2		C				I0
   461  	br.ret.sptk.many b0		C				B
   462  EPILOGUE(func)
   463  ASM_END()