github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/gcd_1.asm (about)

     1  dnl  AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
     2  
     3  dnl  Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: 9.5 cycles/bit (approx)   1x1 gcd
    35  C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
    36  
    37  
    38  C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
    39  C
    40  C This code is nothing very special, but offers a speedup over what gcc 2.95
    41  C can do with mpn/generic/gcd_1.c.
    42  C
    43  C Future:
    44  C
    45  C Using a lookup table to count trailing zeros seems a touch quicker, but
    46  C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
    47  C it too.
    48  
    49  
    50  dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
    51  dnl  bigger than y, then a division x%y is done to reduce it.
    52  dnl
    53  dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
    54  dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
    55  dnl  what's found.
    56  
    57  deflit(DIV_THRESHOLD, 5)
    58  
    59  
    60  defframe(PARAM_LIMB, 12)
    61  defframe(PARAM_SIZE,  8)
    62  defframe(PARAM_SRC,   4)
    63  
    64  	TEXT
    65  	ALIGN(16)
    66  
    67  PROLOGUE(mpn_gcd_1)
    68  deflit(`FRAME',0)
    69  
    70  	ASSERT(ne, `cmpl $0, PARAM_LIMB')
    71  	ASSERT(ae, `cmpl $1, PARAM_SIZE')
    72  
    73  
    74  	movl	PARAM_SRC, %eax
    75  	pushl	%ebx			FRAME_pushl()
    76  
    77  	movl	PARAM_LIMB, %edx
    78  	movl	$-1, %ecx
    79  
    80  	movl	(%eax), %ebx		C src low limb
    81  
    82  	movl	%ebx, %eax		C src low limb
    83  	orl	%edx, %ebx
    84  
    85  L(common_twos):
    86  	shrl	%ebx
    87  	incl	%ecx
    88  
    89  	jnc	L(common_twos)		C 1/4 chance on random data
    90  	shrl	%cl, %edx		C y
    91  
    92  	cmpl	$1, PARAM_SIZE
    93  	ja	L(size_two_or_more)
    94  
    95  
    96  	ASSERT(nz, `orl	%eax, %eax')	C should have src limb != 0
    97  
    98  	shrl	%cl, %eax		C x
    99  
   100  
   101  	C Swap if necessary to make x>=y.  Measures a touch quicker as a
   102  	C jump than a branch free calculation.
   103  	C
   104  	C eax	x
   105  	C ebx
   106  	C ecx	common twos
   107  	C edx	y
   108  
   109  	movl	%eax, %ebx
   110  	cmpl	%eax, %edx
   111  
   112  	jb	L(noswap)
   113  	movl	%edx, %eax
   114  
   115  	movl	%ebx, %edx
   116  	movl	%eax, %ebx
   117  L(noswap):
   118  
   119  
   120  	C See if it's worth reducing x with a divl.
   121  	C
   122  	C eax	x
   123  	C ebx	x
   124  	C ecx	common twos
   125  	C edx	y
   126  
   127  	shrl	$DIV_THRESHOLD, %ebx
   128  
   129  	cmpl	%ebx, %edx
   130  	ja	L(nodiv)
   131  
   132  
   133  	C Reduce x to x%y.
   134  	C
   135  	C eax	x
   136  	C ebx
   137  	C ecx	common twos
   138  	C edx	y
   139  
   140  	movl	%edx, %ebx
   141  	xorl	%edx, %edx
   142  
   143  	divl	%ebx
   144  
   145  	orl	%edx, %edx	C y
   146  	nop	C code alignment
   147  
   148  	movl	%ebx, %eax	C x
   149  	jz	L(done_shll)
   150  L(nodiv):
   151  
   152  
   153  	C eax	x
   154  	C ebx
   155  	C ecx	common twos
   156  	C edx	y
   157  	C esi
   158  	C edi
   159  	C ebp
   160  
   161  L(strip_y):
   162  	shrl	%edx
   163  	jnc	L(strip_y)
   164  
   165  	leal	1(%edx,%edx), %edx
   166  	movl	%ecx, %ebx	C common twos
   167  
   168  	leal	1(%eax), %ecx
   169  	jmp	L(strip_x_and)
   170  
   171  
   172  C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
   173  C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
   174  C only 1/4.
   175  C
   176  C A second computed %cl shift was tried, but that measured a touch slower
   177  C than branching back.
   178  C
   179  C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
   180  C measured about 1 cycle/bit slower.
   181  
   182  	C eax	x
   183  	C ebx	common twos
   184  	C ecx	scratch
   185  	C edx	y
   186  
   187  	ALIGN(4)
   188  L(swap):
   189  	addl	%eax, %edx	C x-y+y = x
   190  	negl	%eax		C -(x-y) = y-x
   191  
   192  L(strip_x):
   193  	shrl	%eax		C odd-odd = even, so always one to strip
   194  	ASSERT(nz)
   195  
   196  L(strip_x_leal):
   197  	leal	1(%eax), %ecx
   198  
   199  L(strip_x_and):
   200  	andl	$1, %ecx	C (x^1)&1
   201  
   202  	shrl	%cl, %eax	C shift if x even
   203  
   204  	testb	$1, %al
   205  	jz	L(strip_x)
   206  
   207  	ASSERT(nz,`testl $1, %eax')	C x, y odd
   208  	ASSERT(nz,`testl $1, %edx')
   209  
   210  	subl	%edx, %eax
   211  	jb	L(swap)
   212  	ja	L(strip_x)
   213  
   214  
   215  	movl	%edx, %eax
   216  	movl	%ebx, %ecx
   217  
   218  L(done_shll):
   219  	shll	%cl, %eax
   220  	popl	%ebx
   221  
   222  	ret
   223  
   224  
   225  C -----------------------------------------------------------------------------
   226  C Two or more limbs.
   227  C
   228  C x={src,size} is reduced modulo y using either a plain mod_1 style
   229  C remainder, or a modexact_1 style exact division.
   230  
   231  deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
   232  
   233  	ALIGN(8)
   234  L(size_two_or_more):
   235  	C eax
   236  	C ebx
   237  	C ecx	common twos
   238  	C edx	y, without common twos
   239  	C esi
   240  	C edi
   241  	C ebp
   242  
   243  deflit(FRAME_TWO_OR_MORE, FRAME)
   244  
   245  	pushl	%edi		defframe_pushl(SAVE_EDI)
   246  	movl	PARAM_SRC, %ebx
   247  
   248  L(y_twos):
   249  	shrl	%edx
   250  	jnc	L(y_twos)
   251  
   252  	movl	%ecx, %edi		C common twos
   253  	movl	PARAM_SIZE, %ecx
   254  
   255  	pushl	%esi		defframe_pushl(SAVE_ESI)
   256  	leal	1(%edx,%edx), %esi	C y (odd)
   257  
   258  	movl	-4(%ebx,%ecx,4), %eax	C src high limb
   259  
   260  	cmpl	%edx, %eax		C carry if high<divisor
   261  
   262  	sbbl	%edx, %edx		C -1 if high<divisor
   263  
   264  	addl	%edx, %ecx		C skip one limb if high<divisor
   265  	andl	%eax, %edx
   266  
   267  	cmpl	$MODEXACT_THRESHOLD, %ecx
   268  	jae	L(modexact)
   269  
   270  
   271  L(divide_top):
   272  	C eax	scratch (quotient)
   273  	C ebx	src
   274  	C ecx	counter, size-1 to 1
   275  	C edx	carry (remainder)
   276  	C esi	divisor (odd)
   277  	C edi
   278  	C ebp
   279  
   280  	movl	-4(%ebx,%ecx,4), %eax
   281  	divl	%esi
   282  	loop	L(divide_top)
   283  
   284  
   285  	movl	%edx, %eax	C x
   286  	movl	%esi, %edx	C y (odd)
   287  
   288  	movl	%edi, %ebx	C common twos
   289  	popl	%esi
   290  
   291  	popl	%edi
   292  	leal	1(%eax), %ecx
   293  
   294  	orl	%eax, %eax
   295  	jnz	L(strip_x_and)
   296  
   297  
   298  	movl	%ebx, %ecx
   299  	movl	%edx, %eax
   300  
   301  	shll	%cl, %eax
   302  	popl	%ebx
   303  
   304  	ret
   305  
   306  
   307  	ALIGN(8)
   308  L(modexact):
   309  	C eax
   310  	C ebx	src ptr
   311  	C ecx	size or size-1
   312  	C edx
   313  	C esi	y odd
   314  	C edi	common twos
   315  	C ebp
   316  
   317  	movl	PARAM_SIZE, %eax
   318  	pushl	%esi		FRAME_pushl()
   319  
   320  	pushl	%eax		FRAME_pushl()
   321  
   322  	pushl	%ebx		FRAME_pushl()
   323  
   324  ifdef(`PIC_WITH_EBX',`
   325  	nop	C code alignment
   326  	call	L(movl_eip_ebx)
   327  	add	$_GLOBAL_OFFSET_TABLE_, %ebx
   328  ')
   329  	CALL(	mpn_modexact_1_odd)
   330  
   331  	movl	%esi, %edx		C y odd
   332  	movl	SAVE_ESI, %esi
   333  
   334  	movl	%edi, %ebx		C common twos
   335  	movl	SAVE_EDI, %edi
   336  
   337  	addl	$eval(FRAME - FRAME_TWO_OR_MORE), %esp
   338  	orl	%eax, %eax
   339  
   340  	leal	1(%eax), %ecx
   341  	jnz	L(strip_x_and)
   342  
   343  
   344  	movl	%ebx, %ecx
   345  	movl	%edx, %eax
   346  
   347  	shll	%cl, %eax
   348  	popl	%ebx
   349  
   350  	ret
   351  
   352  
   353  ifdef(`PIC_WITH_EBX',`
   354  L(movl_eip_ebx):
   355  	movl	(%esp), %ebx
   356  	ret_internal
   357  ')
   358  
   359  EPILOGUE()