github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/lorrshift.asm (about)

     1  dnl  IA-64 mpn_lshift/mpn_rshift.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2000-2005 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C           cycles/limb
    36  C Itanium:      2
    37  C Itanium 2:    1
    38  
    39  C This code is scheduled deeply since the plain shift instructions shr and shl
    40  C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
    41  C these instructions causes a 10 cycle replay trap on Itanium.
    42  
    43  C The ld8 scheduling should probably be decreased to make the function smaller.
    44  C Good lfetch will make sure we never stall anyway.
    45  
    46  C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
    47  C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
    48  C in the prologue.
    49  
    50  
    51  C INPUT PARAMETERS
    52  define(`rp', `r32')		C address the result limbs are stored to
    53  define(`up', `r33')		C address the source limbs are loaded from
    54  define(`n',  `r34')		C limb count; later reused as the loop count for ar.lc
    55  define(`cnt',`r35')		C shift count in bits
    56  
    57  define(`tnc',`r9')		C set to 64 - cnt by the function body
    58  
        C The two blocks below select the direction-specific pieces.
        C FSH is the shift applied by cnt, BSH the opposite shift applied by tnc.
        C UPD is the post-increment for the up and rp pointers, POFF the initial
        C lfetch offset from up, and PUPD the lfetch post-increment.
        C For lshift these three are negative, so the pointers step downward.
    59  ifdef(`OPERATION_lshift',`
    60  	define(`FSH',`shl')
    61  	define(`BSH',`shr.u')
    62  	define(`UPD',`-8')
    63  	define(`POFF',`-512')
    64  	define(`PUPD',`-32')
    65  	define(`func',`mpn_lshift')
    66  ')
        C For rshift the shifts are swapped and the pointers step upward.
    67  ifdef(`OPERATION_rshift',`
    68  	define(`FSH',`shr.u')
    69  	define(`BSH',`shl')
    70  	define(`UPD',`8')
    71  	define(`POFF',`512')
    72  	define(`PUPD',`32')
    73  	define(`func',`mpn_rshift')
    74  ')
    75  
        C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it
        C lists the entry points this source can be built as.  Confirm in the m4 support files.
    76  MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
    77  
    78  ASM_START()
    79  PROLOGUE(func)
        C func is mpn_lshift or mpn_rshift, depending on which OPERATION_* symbol
        C is defined.  It reads n limbs from up, shifts them by cnt bits, stores
        C n limbs through rp, and returns in r8 the first loaded limb shifted by
        C BSH with count tnc (the bits shifted out).
        C NOTE(review): n >= 1 and 1 <= cnt <= 63 are presumably required by the
        C caller; nothing in this function checks them.
        C
        C Register roles:
        C   r2        copy of ar.lc on entry, restored before the return
        C   r8        return value
        C   tnc (r9)  64 - cnt
        C   r10       first limb loaded
        C   r11       lfetch address, starts at up + POFF and steps by PUPD
        C   r16-r19   limbs loaded from up
        C   r20-r27   FSH and BSH partial results
        C   r14, r15  combined result limbs waiting to be stored
        C   p14, p15  n > 4, n <= 4
        C   p6-p8     n mod 4 equal to 1, 2, 3
    80  	.prologue
    81  	.save	ar.lc, r2
    82  	.body
        C With the 32-bit ABI the arguments are widened first: addp4 for the two
        C pointers, sxt4 for n, zxt4 for cnt.
    83  ifdef(`HAVE_ABI_32',
    84  `	addp4	rp = 0, rp		C			M I
    85  	addp4	up = 0, up		C		M I
    86  	sxt4	n = n			C		M I
    87  	nop.m		0
    88  	nop.m		0
    89  	zxt4	cnt = cnt		C		I
    90  	;;
    91  ')
    92  
        C Setup: p14/p15 = (n > 4)/(n <= 4), r14 = n mod 4, r2 = ar.lc,
        C r15 = n - 1, tnc = 64 - cnt, r16 = n - 5.  Then p6/p7/p8 select the
        C n mod 4 case and n becomes (n - 5) >> 2, which is written to ar.lc.
        C For lshift only, up and rp are advanced by 8 * (n - 1) bytes so that
        C processing starts at the last limb and walks downward.
        C The first limb goes to r10 and r11 is set up as the lfetch address.
    93   {.mmi;	cmp.lt	p14, p15 = 4, n		C		M I
    94  	and	r14 = 3, n		C		M I
    95  	mov.i	r2 = ar.lc		C		I0
    96  }{.mmi;	add	r15 = -1, n		C		M I
    97  	sub	tnc = 64, cnt		C		M I
    98  	add	r16 = -5, n
    99  	;;
   100  }{.mmi;	cmp.eq	p6, p0 = 1, r14		C		M I
   101  	cmp.eq	p7, p0 = 2, r14		C		M I
   102  	shr.u	n = r16, 2		C		I0
   103  }{.mmi;	cmp.eq	p8, p0 = 3, r14		C		M I
   104  ifdef(`OPERATION_lshift',
   105  `	shladd	up = r15, 3, up		C		M I
   106  	shladd	rp = r15, 3, rp')	C		M I
   107  	;;
   108  }{.mmi;	add	r11 = POFF, up		C		M I
   109  	ld8	r10 = [up], UPD		C		M01
   110  	mov.i	ar.lc = n		C		I0
   111  }{.bbb;
   112     (p6)	br.dptk	.Lb01
   113     (p7)	br.dptk	.Lb10
   114     (p8)	br.dptk	.Lb11
   115  	;; }
   116  
        C n mod 4 = 0.  Three more limbs are loaded.  With n > 4 control moves to
        C .grt4; otherwise the remaining work for four limbs is done here and the
        C tail is entered at .Lr4.
   117  .Lb00:	ld8	r19 = [up], UPD
   118  	;;
   119  	ld8	r16 = [up], UPD
   120  	;;
   121  	ld8	r17 = [up], UPD
   122  	BSH	r8 = r10, tnc		C function return value
   123  	;;
   124  	FSH	r24 = r10, cnt
   125  	BSH	r25 = r19, tnc
   126    (p14)	br.cond.dptk	.grt4
   127  	;;
   128  	FSH	r26 = r19, cnt
   129  	BSH	r27 = r16, tnc
   130  	;;
   131  	FSH	r20 = r16, cnt
   132  	BSH	r21 = r17, tnc
   133  	;;
   134  	or	r14 = r25, r24
   135  	FSH	r22 = r17, cnt
   136  	BSH	r23 = r10, tnc		C NOTE(review): r23 is not read between .Lr4 and the return; looks unused, confirm
   137  	br	.Lr4
   138  
        C n mod 4 = 0 and n > 4.  Four more limbs are loaded while the partial
        C results are computed; the loop is entered at .Ltop while ar.lc is
        C nonzero, otherwise the tail is entered at .Lbot.
   139  .grt4:	ld8	r18 = [up], UPD
   140  	FSH	r26 = r19, cnt
   141  	BSH	r27 = r16, tnc
   142  	;;
   143  	ld8	r19 = [up], UPD
   144  	FSH	r20 = r16, cnt
   145  	BSH	r21 = r17, tnc
   146  	;;
   147  	ld8	r16 = [up], UPD
   148  	FSH	r22 = r17, cnt
   149  	BSH	r23 = r18, tnc
   150  	;;
   151  	or	r14 = r25, r24
   152  	ld8	r17 = [up], UPD
   153  	br.cloop.dpnt	.Ltop
   154  	br	.Lbot
   155  
        C n mod 4 = 1.  With n <= 4 (p15) only the return value and one result
        C limb are needed, and the tail is entered at its last store.
   156  .Lb01:
   157    (p15)	BSH	r8 = r10, tnc		C function return value	I
   158    (p15)	FSH	r22 = r10, cnt		C		I
   159    (p15)	br.cond.dptk	.Lr1		C return	B
   160  
        C n mod 4 = 1 and n > 4.  Falls through to the tail at .Lr5 when ar.lc
        C is zero, otherwise continues at .grt5.
   161  .grt1:	ld8	r18 = [up], UPD
   162  	;;
   163  	ld8	r19 = [up], UPD
   164  	BSH	r8 = r10, tnc		C function return value
   165  	;;
   166  	ld8	r16 = [up], UPD
   167  	FSH	r22 = r10, cnt
   168  	BSH	r23 = r18, tnc
   169  	;;
   170  	ld8	r17 = [up], UPD
   171  	FSH	r24 = r18, cnt
   172  	BSH	r25 = r19, tnc
   173  	br.cloop.dpnt	.grt5
   174  	;;
   175  	or	r15 = r23, r22
   176  	FSH	r26 = r19, cnt
   177  	BSH	r27 = r16, tnc
   178  	;;
   179  	FSH	r20 = r16, cnt
   180  	BSH	r21 = r17, tnc
   181  	br	.Lr5
   182  
        C Loads further limbs and enters the main loop at its .LL01 stage.
   183  .grt5:	ld8	r18 = [up], UPD
   184  	FSH	r26 = r19, cnt
   185  	BSH	r27 = r16, tnc
   186  	;;
   187  	ld8	r19 = [up], UPD
   188  	FSH	r20 = r16, cnt
   189  	BSH	r21 = r17, tnc
   190  	;;
   191  	or	r15 = r23, r22
   192  	ld8	r16 = [up], UPD
   193  	br	.LL01
   194  
   195  
        C n mod 4 = 2.  With n <= 4 the two result limbs are produced here and
        C the tail is entered at .Lr2; with n > 4 (p14) control moves to .grt2.
   196  .Lb10:	ld8	r17 = [up], UPD
   197    (p14)	br.cond.dptk	.grt2
   198  
   199  	BSH	r8 = r10, tnc		C function return value
   200  	;;
   201  	FSH	r20 = r10, cnt
   202  	BSH	r21 = r17, tnc
   203  	;;
   204  	or	r14 = r21, r20
   205  	FSH	r22 = r17, cnt
   206  	br	.Lr2			C return
   207  
        C n mod 4 = 2 and n > 4.  Falls through to the tail at .Lr6 when ar.lc
        C is zero, otherwise continues at .grt6.
   208  .grt2:	ld8	r18 = [up], UPD
   209  	BSH	r8 = r10, tnc		C function return value
   210  	;;
   211  	ld8	r19 = [up], UPD
   212  	FSH	r20 = r10, cnt
   213  	BSH	r21 = r17, tnc
   214  	;;
   215  	ld8	r16 = [up], UPD
   216  	FSH	r22 = r17, cnt
   217  	BSH	r23 = r18, tnc
   218  	;;
   219   {.mmi;	ld8	r17 = [up], UPD
   220  	or	r14 = r21, r20
   221  	FSH	r24 = r18, cnt
   222  }{.mib;	nop	0
   223  	BSH	r25 = r19, tnc
   224  	br.cloop.dpnt	.grt6
   225  	;; }
   226  
   227  	FSH	r26 = r19, cnt
   228  	BSH	r27 = r16, tnc
   229  	br	.Lr6
   230  
        C Loads further limbs and enters the main loop at its .LL10 stage.
   231  .grt6:	ld8	r18 = [up], UPD
   232  	FSH	r26 = r19, cnt
   233  	BSH	r27 = r16, tnc
   234  	;;
   235  	ld8	r19 = [up], UPD
   236  	br	.LL10
   237  
   238  
        C n mod 4 = 3.  With n <= 4 the three result limbs are prepared here and
        C the tail is entered at .Lr3; with n > 4 (p14) control moves to .grt3.
   239  .Lb11:	ld8	r16 = [up], UPD
   240  	;;
   241  	ld8	r17 = [up], UPD
   242  	BSH	r8 = r10, tnc		C function return value
   243    (p14)	br.cond.dptk	.grt3
   244  	;;
   245  
   246  	FSH	r26 = r10, cnt
   247  	BSH	r27 = r16, tnc
   248  	;;
   249  	FSH	r20 = r16, cnt
   250  	BSH	r21 = r17, tnc
   251  	;;
   252  	or	r15 = r27, r26
   253  	FSH	r22 = r17, cnt
   254  	br	.Lr3			C return
   255  
        C n mod 4 = 3 and n > 4.  Falls through to the tail at .Lr7 when ar.lc
        C is zero, otherwise continues at .grt7.
   256  .grt3:	ld8	r18 = [up], UPD
   257  	FSH	r26 = r10, cnt
   258  	BSH	r27 = r16, tnc
   259  	;;
   260  	ld8	r19 = [up], UPD
   261  	FSH	r20 = r16, cnt
   262  	BSH	r21 = r17, tnc
   263  	;;
   264  	ld8	r16 = [up], UPD
   265  	FSH	r22 = r17, cnt
   266  	BSH	r23 = r18, tnc
   267  	;;
   268  	ld8	r17 = [up], UPD
   269  	br.cloop.dpnt	.grt7
   270  
   271  	or	r15 = r27, r26
   272  	FSH	r24 = r18, cnt
   273  	BSH	r25 = r19, tnc
   274  	br	.Lr7
   275  
        C Loads one further limb and enters the main loop at its .LL11 stage.
   276  .grt7:	or	r15 = r27, r26
   277  	FSH	r24 = r18, cnt
   278  	BSH	r25 = r19, tnc
   279  	ld8	r18 = [up], UPD
   280  	br	.LL11
   281  
        C Each of the four loop stages stores one finished limb, combines the
        C next pair of partial results, shifts one limb by cnt and another by tnc,
        C and loads one new limb.  The first stage also issues the lfetch through
        C r11.  br.cloop at the end of the fourth stage repeats while ar.lc is
        C nonzero.  The .LL11, .LL10 and .LL01 labels are the mid-loop entry
        C points used by the feed-in code above.
   282  C *** MAIN LOOP START ***
   283  	ALIGN(32)
   284  .Ltop:
   285   {.mmi;	st8	[rp] = r14, UPD		C M2
   286  	or	r15 = r27, r26		C M3
   287  	FSH	r24 = r18, cnt		C I0
   288  }{.mmi;	ld8	r18 = [up], UPD		C M1
   289  	lfetch	[r11], PUPD
   290  	BSH	r25 = r19, tnc		C I1
   291  	;; }
   292  .LL11:
   293   {.mmi;	st8	[rp] = r15, UPD
   294  	or	r14 = r21, r20
   295  	FSH	r26 = r19, cnt
   296  }{.mmi;	ld8	r19 = [up], UPD
   297  	nop.m	0
   298  	BSH	r27 = r16, tnc
   299  	;; }
   300  .LL10:
   301   {.mmi;	st8	[rp] = r14, UPD
   302  	or	r15 = r23, r22
   303  	FSH	r20 = r16, cnt
   304  }{.mmi;	ld8	r16 = [up], UPD
   305  	nop.m	0
   306  	BSH	r21 = r17, tnc
   307  	;; }
   308  .LL01:
   309   {.mmi;	st8	[rp] = r15, UPD
   310  	or	r14 = r25, r24
   311  	FSH	r22 = r17, cnt
   312  }{.mib;	ld8	r17 = [up], UPD
   313  	BSH	r23 = r18, tnc
   314  	br.cloop.dptk	.Ltop
   315  	;; }
   316  C *** MAIN LOOP END ***
   317  
        C Tail: no further loads.  Each stage from .Lbot down to .Lr1 stores one
        C limb, so entering at .LrK stores the last K limbs and .Lbot stores the
        C last eight.  The final store writes r22, the last limb shifted by cnt
        C alone.
   318  .Lbot:
   319   {.mmi;	st8	[rp] = r14, UPD
   320  	or	r15 = r27, r26
   321  	FSH	r24 = r18, cnt
   322  }{.mib;	nop	0
   323  	BSH	r25 = r19, tnc
   324  	nop	0
   325  	;; }
   326  .Lr7:
   327   {.mmi;	st8	[rp] = r15, UPD
   328  	or	r14 = r21, r20
   329  	FSH	r26 = r19, cnt
   330  }{.mib;	nop	0
   331  	BSH	r27 = r16, tnc
   332  	nop	0
   333  	;; }
   334  .Lr6:
   335   {.mmi;	st8	[rp] = r14, UPD
   336  	or	r15 = r23, r22
   337  	FSH	r20 = r16, cnt
   338  }{.mib;	nop	0
   339  	BSH	r21 = r17, tnc
   340  	nop	0
   341  	;; }
   342  .Lr5:	st8	[rp] = r15, UPD
   343  	or	r14 = r25, r24
   344  	FSH	r22 = r17, cnt
   345  	;;
   346  .Lr4:	st8	[rp] = r14, UPD
   347  	or	r15 = r27, r26
   348  	;;
   349  .Lr3:	st8	[rp] = r15, UPD
   350  	or	r14 = r21, r20
   351  	;;
   352  .Lr2:	st8	[rp] = r14, UPD
   353  	;;
   354  .Lr1:	st8	[rp] = r22, UPD		C		M23
   355  	mov	ar.lc = r2		C		I0
   356  	br.ret.sptk.many b0		C		B
   357  EPILOGUE(func)
   358  ASM_END()