github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/mod_34lsub1.asm (about)

     1  dnl  IA-64 mpn_mod_34lsub1
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C           cycles/limb
    36  C Itanium:      ?
    37  C Itanium 2:    1
    38  
    39  
    40  C INPUT PARAMETERS
    41  define(`up', `r32')		C address of the next limb; each ld8 post-increments it by 8
    42  define(`n',  `r33')		C limb count on entry; past L(gt1) it counts limbs not yet loaded
    43  
    44  C Some useful aliases for registers we use
        C   u0-u2  the most recently loaded limbs
        C   a0-a2  three 64-bit accumulators; the limbs are subtracted from them
        C   c0-c2  counters incremented once per borrow out of an accumulator
        C r14 and r15 (u0, u1) are reused as scratch in the final folding code,
        C after the last limb has been consumed.
    45  define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
    46  define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
    47  define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
    48  
    49  C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
    50  C with a more sophisticated implementation.  If we're really crazy, we could
    51  C super-unroll, storing carries just in predicate registers, then copy them to
    52  C a general register, and population count them from there.  That'd bring us
    53  C close to 3 insn/limb, for nearly 0.5 c/l.
    54  
    55  C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
    56  C We therefore use a plain while-style loop:
    57  C	add		n = -3, n
    58  C	cmp.le		p9, p0 = 3, n
    59  C  (p9)	br.cond		.Loop
    60  C Alternatively, we could table n/3 for, say, n < 256, and predicate the
    61  C 16-cycle code.
    62  
    63  C The summing-up code at the end was written quickly, and could surely be
    64  C vastly improved.
    65  
    66  ASM_START()
        C mpn_mod_34lsub1
        C
        C In:   up (r32) = address of the first 8-byte limb
        C       n  (r33) = number of limbs; must be at least 1 (the first limb is
        C                  loaded unconditionally and there is no n = 0 path)
        C Out:  r8 = a value congruent, modulo 2^48-1, to the n-limb operand
        C            (limb i weighted by 2^(64*i)).  The value is NOT reduced to
        C            the range below 2^48-1.
        C Uses: r8, r10, r11, r14-r22, r24-r31, p6-p11; up and n are modified.
        C
        C Method: limb i is SUBTRACTED from accumulator a(i mod 3).  A borrow out
        C of a0, a1, a2 is counted in c1, c2, c0 respectively, that is, in the
        C counter of the next position (a2 wraps around to c0 because
        C 2^192 == 1 mod 2^48-1).  L(com) folds accumulators and counters at
        C 48-bit boundaries and negates the accumulator part.
    67  PROLOGUE(mpn_mod_34lsub1)
    68  	.prologue
        C NOTE(review): no live instruction copies ar.lc to r2 or writes ar.lc;
        C the only ar.lc write is in the disabled ifelse block below.  Confirm
        C whether this unwind directive is still wanted.
    69  	.save	ar.lc, r2
    70  	.body
        C 32-bit ABI only: widen the pointer with addp4 and zero-extend the
        C 32-bit count.
    71  ifdef(`HAVE_ABI_32',`
    72  	addp4		up = 0, up		C			M I
    73  	nop.m		0
    74  	zxt4		n = n			C			I
    75  	;;
    76  ')
    77  
        C Disabled code (ifelse compares 0 with 1, which never match).  It would
        C compute a loop count of n/3 via a multiply by 0xAAAAAAAAAAAAAAAB and put
        C it in ar.lc; see the header remark about the 16-cycle startup cost.
    78  ifelse(0,1,`
    79  	movl		r14 = 0xAAAAAAAAAAAAAAAB
    80  	;;
    81  	setf.sig	f6 = r14
    82  	setf.sig	f7 = r33
    83  	;;
    84  	xmpy.hu		f6 = f6, f7
    85  	;;
    86  	getf.sig	r8 = f6
    87  	;;
    88  	shr.u		r8 = r8, 1		C Loop count
    89  	;;
    90  	mov.i		ar.lc = r8
    91  ')
    92  
    93  	ld8	u0 = [up], 8		C u0 = limb 0, up += 8
    94  	cmp.ne	p9, p0 = 1, n		C p9 = (n != 1)
    95    (p9)	br	L(gt1)
    96  	;;
        C n = 1: return (u0 >> 48) + (low 48 bits of u0).
    97  	shr.u	r8 = u0, 48
    98  	dep.z	r27 = u0, 0, 48
    99  	;;
   100  	add	r8 = r8, r27
   101  	br.ret.sptk.many b0
   102  
   103  
        C n > 1: clear accumulators and borrow counters, load limb 1 into u1.
        C After the add below, n = number of limbs not yet loaded (two are
        C already in u0 and u1).
   104  L(gt1):
   105   {.mmi;	nop.m	0
   106  	mov	a0 = 0
   107  	add	n = -2, n
   108  }{.mmi;	mov	c0 = 0
   109  	mov	c1 = 0
   110  	mov	c2 = 0
   111  	;;
   112  }{.mmi;	ld8	u1 = [up], 8
   113  	mov	a1 = 0
   114  	cmp.ltu	p6, p0 = r0, r0		C clear p6
   115  }{.mmb;	cmp.gt	p9, p0 = 3, n		C fewer than 3 limbs left to load:
   116  	mov	a2 = 0
   117    (p9)	br.cond.dptk	L(end)		C skip the main loop
   118  	;;
   119  }
        C Main loop, 3 limbs per iteration.
        C On entry to each iteration: u0 and u1 hold loaded but unprocessed
        C limbs, p6 holds the not yet counted borrow of the previous a2
        C subtraction (cleared before the first iteration), and n >= 3.
        C Each cmp.ltu comes before the matching sub, so it tests the accumulator
        C value before the subtraction: pX is set exactly when that sub borrows.
        C The borrow is added to its counter one instruction group later.
   120  	ALIGN(32)
   121  L(top):
   122   {.mmi;	ld8	u2 = [up], 8
   123    (p6)	add	c0 = 1, c0
   124  	cmp.ltu	p7, p0 = a0, u0
   125  }{.mmb;	sub	a0 = a0, u0
   126  	add	n = -3, n		C three more limbs are loaded per pass
   127  	nop.b	0
   128  	;;
   129  }{.mmi;	ld8	u0 = [up], 8
   130    (p7)	add	c1 = 1, c1
   131  	cmp.ltu	p8, p0 = a1, u1
   132  }{.mmb;	sub	a1 = a1, u1
   133  	cmp.le	p9, p0 = 3, n		C p9 = (n >= 3): go round again
   134  	nop.b	0
   135  	;;
   136  }{.mmi;	ld8	u1 = [up], 8
   137    (p8)	add	c2 = 1, c2
   138  	cmp.ltu	p6, p0 = a2, u2
   139  }{.mmb;	sub	a2 = a2, u2
   140  	nop.m	0
   141  dnl	br.cloop.dptk	L(top)
   142    (p9)	br.cond.dptk	L(top)
   143  	;;
   144  }
        C Tail.  Here n is 0, 1 or 2 limbs still to load; u0 and u1 are loaded
        C but unprocessed and p6 may hold a pending borrow for c0.
   145  L(end):
   146  	cmp.eq	p10, p0 = 0, n		C p10 = (n == 0)
   147  	cmp.eq	p11, p0 = 1, n		C p11 = (n == 1)
   148    (p10)	br	L(0)
   149  
        C n is 1 or 2: load one more limb into u2 and process u0.  For n == 1
        C continue at L(1).  For n == 2 fall through: load the last limb into u0,
        C then process u1, u2 and that last u0.
   150  L(2):
   151   {.mmi;	ld8	u2 = [up], 8
   152    (p6)	add	c0 = 1, c0
   153  	cmp.ltu	p7, p0 = a0, u0
   154  }{.mmb;	sub	a0 = a0, u0
   155  	nop.m	0
   156    (p11)	br	L(1)
   157  	;;
   158  }	ld8	u0 = [up], 8
   159    (p7)	add	c1 = 1, c1
   160  	cmp.ltu	p8, p0 = a1, u1
   161  	sub	a1 = a1, u1
   162  	;;
   163    (p8)	add	c2 = 1, c2
   164  	cmp.ltu	p6, p0 = a2, u2
   165  	sub	a2 = a2, u2
   166  	;;
   167    (p6)	add	c0 = 1, c0
   168  	cmp.ltu	p7, p0 = a0, u0
   169  	sub	a0 = a0, u0
   170  	;;
   171    (p7)	add	c1 = 1, c1
   172  	br	L(com)
   173  
   174  
        C n was 1: u0 is already subtracted (borrow pending in p7); process u1
        C and u2.
   175  L(1):
   176    (p7)	add	c1 = 1, c1
   177  	cmp.ltu	p8, p0 = a1, u1
   178  	sub	a1 = a1, u1
   179  	;;
   180    (p8)	add	c2 = 1, c2
   181  	cmp.ltu	p6, p0 = a2, u2
   182  	sub	a2 = a2, u2
   183  	;;
   184    (p6)	add	c0 = 1, c0
   185  	br	L(com)
   186  
   187  
        C n was 0: count the pending p6 borrow, then process u0 and u1 and fall
        C through to L(com).
   188  L(0):
   189    (p6)	add	c0 = 1, c0
   190  	cmp.ltu	p7, p0 = a0, u0
   191  	sub	a0 = a0, u0
   192  	;;
   193    (p7)	add	c1 = 1, c1
   194  	cmp.ltu	p8, p0 = a1, u1
   195  	sub	a1 = a1, u1
   196  	;;
   197    (p8)	add	c2 = 1, c2
   198  
        C Fold.  a2:a1:a0 and c2:c1:c0 are each treated as a 192-bit value and cut
        C into four 48-bit fields (the diagram below shows the 64-bit registers
        C on top and the 48-bit fields underneath).  Since 2^48 == 1 mod 2^48-1,
        C the fields are simply added: the shr.u results are the parts of each
        C register above a field boundary, the dep.z results are the parts
        C below it, moved to their bit position inside the field.
        C r14 and r15 are free for scratch use here: u0 and u1 are dead.
   199  L(com):
   200  C |     a2    |     a1    |     a0    |
   201  C |        |        |        |        |
   202  	shr.u	r24 = a0, 48		C 16 bits
   203  	shr.u	r25 = a1, 32		C 32 bits
   204  	shr.u	r26 = a2, 16		C 48 bits
   205  	;;
   206  	shr.u	r10 = c0, 48		C 16 bits, always zero
   207  	shr.u	r11 = c1, 32		C 32 bits
   208  	shr.u	r30 = c2, 16		C 48 bits
   209  	;;
   210  	dep.z	r27 = a0,  0, 48	C 48 bits
   211  	dep.z	r28 = a1, 16, 32	C 48 bits
   212  	dep.z	r29 = a2, 32, 16	C 48 bits
   213  	dep.z	r31 = c0,  0, 48	C 48 bits
   214  	dep.z	r14 = c1, 16, 32	C 48 bits
   215  	dep.z	r15 = c2, 32, 16	C 48 bits
   216  	;;
        C Pairwise sums: r24, r26, r28 collect the accumulator pieces and
        C r10, r30, r14 collect the borrow-counter pieces.
   217   {.mmi;	add	r24 = r24, r25
   218  	add	r26 = r26, r27
   219  	add	r28 = r28, r29
   220  }{.mmi;	add	r10 = r10, r11
   221  	add	r30 = r30, r31
   222  	add	r14 = r14, r15
   223  	;;
   224  }
        C The accumulators hold the NEGATED limb sums, so the folded accumulator
        C total r24 is subtracted and the folded borrow total r10 is added.
        C The constant is 0xffffffffffff << 4, a multiple of 2^48-1, so adding it
        C does not change the residue; it is larger than the sum of six 48-bit
        C pieces, which keeps the difference from wrapping below zero.
   225  	movl	r8 = 0xffffffffffff0
   226  	add	r24 = r24, r26
   227  	add	r10 = r10, r30
   228  	;;
   229  	add	r24 = r24, r28
   230  	add	r10 = r10, r14
   231  	;;
   232  	sub	r8 = r8, r24
   233  	;;
   234  	add	r8 = r8, r10		C r8 = constant - folded(a) + folded(c)
   235  	br.ret.sptk.many b0
   236  EPILOGUE()
   237  ASM_END()