github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev67/gcd_1.asm (about)

     1  dnl  Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
     2  
     3  dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C ev67: 3.4 cycles/bitpair for 1x1 part
    35  
    36  
    37  C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
    38  C
    39  C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
    40  C strip trailing zeros from abs(x-y) to maintain x and y both odd.
    41  C
    42  C The trailing zeros are calculated from just x-y, since in twos-complement
    43  C there's the same number of trailing zeros on d or -d.  This means the cttz
    44  C runs in parallel with abs(x-y).
    45  C
    46  C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit
    47  C operands with this algorithm gives the measured 3.4 c/l.
    48  C
    49  C The slottings shown are for SVR4 style systems, Unicos differs in the
    50  C initial gp setup and the LEA.
    51  C
    52  C Enhancement:
    53  C
    54  C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
    55  C it to a bsr, but probably only in a static binary.  Plain "jsr foo" gives
    56  C the right object code for relaxation, and ought to be available
    57  C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
    58  C the usual case of running in a shared library.
    59  C
    60  C bsr could perhaps be used explicitly anyway.  We should be able to assume
    61  C modexact is in the same module as us (ie. shared library or mainline).
    62  C Would there be any worries about the size of the displacement?  Could
    63  C always put modexact and gcd_1 in the same .o to be certain.
    64  
    65  ASM_START()
    66  PROLOGUE(mpn_gcd_1, gp)
    67  
    68  	C r16	xp
    69  	C r17	size
    70  	C r18	y
    71  
    72  	C ldah				C l
    73  	C lda				C u
    74  
    75  	ldq	r0, 0(r16)		C L   x = xp[0]
    76  	lda	r30, -32(r30)		C u   alloc stack
    77  
    78  	LEA(  r27, mpn_modexact_1c_odd)	C L   modexact addr, ldq (gp)
    79  	stq	r10, 16(r30)		C L   save r10
    80  	cttz	r18, r10		C U0  y twos
    81  	cmpeq	r17, 1, r5		C u   test size==1
    82  
    83  	stq	r9, 8(r30)		C L   save r9
    84  	clr	r19			C u   zero c for modexact
    85  	unop
    86  	unop
    87  
    88  	cttz	r0, r6			C U0  x twos
    89  	stq	r26, 0(r30)		C L   save ra
    90  
    91  	srl	r18, r10, r18		C U   y odd
    92  
    93  	mov	r18, r9			C l   hold y across call
    94  
    95  	cmpult	r6, r10, r2		C u   test x_twos < y_twos
    96  
    97  	cmovne	r2, r6, r10		C l   common_twos = min(x_twos,y_twos)
    98  	bne	r5, L(one)		C U   no modexact if size==1
    99  	jsr	r26, (r27), mpn_modexact_1c_odd   C L0
   100  
   101  	LDGP(	r29, 0(r26))		C u,l ldah,lda
   102  	cttz	r0, r6			C U0  new x twos
   103  	ldq	r26, 0(r30)		C L   restore ra
   104  
   105  L(one):
   106  	mov	r9, r1			C u   y
   107  	ldq	r9, 8(r30)		C L   restore r9
   108  	mov	r10, r2			C u   common twos
   109  	ldq	r10, 16(r30)		C L   restore r10
   110  
   111  	lda	r30, 32(r30)		C l   free stack
   112  	beq	r0, L(done)		C U   return y if x%y==0
   113  
   114  	srl	r0, r6, r0		C U   x odd
   115  	unop
   116  
   117  	ALIGN(16)
   118  L(top):
   119  	C r0	x
   120  	C r1	y
   121  	C r2	common twos, for use at end
   122  
   123  	subq	r0, r1, r7		C l0  d = x - y
   124  	cmpult	r0, r1, r16		C u0  test x >= y
   125  
   126  	subq	r1, r0, r4		C l0  new_x = y - x
   127  	cttz	r7, r8			C U0  d twos
   128  
   129  	cmoveq	r16, r7, r4		C l0  new_x = d if x>=y
   130  	cmovne	r16, r0, r1		C u0  y = x if x<y
   131  	unop				C l   \ force cmoveq into l0
   132  	unop				C u   /
   133  
   134  	C				C cmoveq2 L0, cmovne2 U0
   135  
   136  	srl	r4, r8, r0		C U0  x = new_x >> twos
   137  	bne	r7, L(top)		C U1  stop when d==0
   138  
   139  
   140  L(done):
   141  	sll	r1, r2, r0		C U0  return y << common_twos
   142  	ret	r31, (r26), 1		C L0
   143  
   144  EPILOGUE()
   145  ASM_END()