github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/gcd_1.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/gcd_1.asm (about)

     1  dnl  x86 mpn_gcd_1 optimised for AMD K7.
     2  
     3  dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
     4  dnl  Granlund.
     5  
     6  dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
     7  dnl  Foundation, Inc.
     8  
     9  dnl  This file is part of the GNU MP Library.
    10  dnl
    11  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    12  dnl  it under the terms of either:
    13  dnl
    14  dnl    * the GNU Lesser General Public License as published by the Free
    15  dnl      Software Foundation; either version 3 of the License, or (at your
    16  dnl      option) any later version.
    17  dnl
    18  dnl  or
    19  dnl
    20  dnl    * the GNU General Public License as published by the Free Software
    21  dnl      Foundation; either version 2 of the License, or (at your option) any
    22  dnl      later version.
    23  dnl
    24  dnl  or both in parallel, as here.
    25  dnl
    26  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    27  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    28  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    29  dnl  for more details.
    30  dnl
    31  dnl  You should have received copies of the GNU General Public License and the
    32  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    33  dnl  see https://www.gnu.org/licenses/.
    34  
    35  include(`../config.m4')
    36  
    37  
    38  C	     cycles/bit (approx)
    39  C AMD K7	 5.31
    40  C AMD K8,K9	 5.33
    41  C AMD K10	 5.30
    42  C AMD bd1	 ?
    43  C AMD bobcat	 7.02
    44  C Intel P4-2	10.1
    45  C Intel P4-3/4	10.0
    46  C Intel P6/13	 5.88
    47  C Intel core2	 6.26
    48  C Intel NHM	 6.83
    49  C Intel SBR	 8.50
    50  C Intel atom	 8.90
    51  C VIA nano	 ?
    52  C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
    53  
    54  C TODO
    55  C  * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
    56  C  * Stream things better through registers, avoiding some copying.
    57  C  * For ELF, avoid putting GOT base in both ebx and esi.  Needs special
    58  C    LEA/LEAL or else discrete code here.
    59  
    60  C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
    61  
    62  deflit(MAXSHIFT, 6)		C shift applied at L(shift_alot); also stored as ctz_table[0]
    63  deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))	C ANDed with a value to form the ctz_table index
    64  
    65  DEF_OBJECT(ctz_table,64)
    66  	.byte	MAXSHIFT		C entry 0: index bits all zero, the loop shifts by MAXSHIFT and looks up again
    67  forloop(i,1,MASK,
    68  `	.byte	m4_count_trailing_zeros(i)
    69  ')
    70  END_OBJECT(ctz_table)		C MASK+1 one-byte entries in total
    71  
    72  C Threshold of when to call bmod when U is one limb.  Should be about
    73  C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
    74  define(`DIV_THRES_LOG2', 7)		C one-limb path divides only when v0 <= u0 >> DIV_THRES_LOG2
    75  
    76  
    77  define(`up',    `%edi')		C pointer to the U limbs, first stack argument
    78  define(`n',     `%esi')		C limb count of U, second stack argument
    79  define(`v0',    `%edx')		C single limb V, third stack argument
    80  
    81  
    82  ASM_START()
    83  	TEXT
    84  	ALIGN(16)
        C mpn_gcd_1: gcd of the n-limb number at up and the single limb v0.
        C The three arguments up, n, v0 are read from the stack and the result is
        C returned in eax.  edi and esi are saved on entry and restored before ret.
        C v0 must be nonzero, otherwise L(divide_strip_y) never shifts out a set bit.
        C Outline: count and remove the twos common to u0 and v0, make v0 odd,
        C reduce U to a single limb, run a table-driven binary gcd loop, and
        C finally shift the common twos back into the result.
    85  PROLOGUE(mpn_gcd_1)
    86  	push	%edi			C saved, restored at L(end)
    87  	push	%esi			C saved, restored at L(end)
    88  
    89  	mov	12(%esp), up		C arguments sit above the return address and the two pushes
    90  	mov	16(%esp), n
    91  	mov	20(%esp), v0
    92  
    93  	mov	(up), %eax		C U low limb
    94  	or	v0, %eax		C x | y
    95  	mov	$-1, %ecx		C twos counter, incremented before each bit test
    96  
    97  L(twos):
    98  	inc	%ecx
    99  	shr	%eax			C CF = bit shifted out of x | y
   100  	jnc	L(twos)			C repeat until a 1 bit has been shifted out
   101  
   102  	shr	%cl, v0			C drop the common twos from v0
   103  	mov	%ecx, %eax		C common twos
   104  
   105  L(divide_strip_y):
   106  	shr	v0			C strip remaining low zero bits of v0
   107  	jnc	L(divide_strip_y)
   108  	adc	v0, v0			C put back the 1 bit just shifted out, v0 is now odd
   109  
   110  	push	%eax			C common twos count, popped at L(end)
   111  	push	v0			C odd v0, popped at L(reduced)
   112  
   113  	cmp	$1, n
   114  	jnz	L(reduce_nby1)		C n != 1: reduce U by a call
   115  
   116  C Both U and V are single limbs, reduce with div if u0 >> v0.
   117  	mov	(up), %ecx
   118  	mov	%ecx, %eax		C eax = u0, used unreduced if the division is skipped
   119  	shr	$DIV_THRES_LOG2, %ecx
   120  	cmp	%ecx, v0
   121  	ja	L(reduced)		C v0 > u0 >> DIV_THRES_LOG2: skip the division
   122  
   123  	mov	v0, %esi		C divisor; esi is reloaded at L(reduced) so n may be overwritten
   124  	xor	%edx, %edx		C high half of the dividend is zero
   125  	div	%esi			C edx = u0 mod v0
   126  	mov	%edx, %eax		C eax = remainder
   127  	jmp	L(reduced)
   128  
   129  L(reduce_nby1):
   130  ifdef(`PIC_WITH_EBX',`dnl
   131  	push	%ebx		C saved around the call and restored at L(called)
   132  	add	$-4, %esp	C extra 4 byte slot released together with the params
   133  	call	L(movl_eip_ebx)	C ebx = address of the next instruction
   134  	add	$_GLOBAL_OFFSET_TABLE_, %ebx
   135  ')
   136  	push	v0			C param 3
   137  	push	n			C param 2
   138  	push	up			C param 1
   139  	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
   140  	jl	L(bmod)			C n below the threshold: use mpn_modexact_1_odd
   141  	CALL(	mpn_mod_1)
   142  	jmp	L(called)
   143  L(bmod):
   144  	CALL(	mpn_modexact_1_odd)
   145  
   146  L(called):
   147  ifdef(`PIC_WITH_EBX',`dnl
   148  	add	$16, %esp	C deallocate params plus the 4 byte slot
   149  	pop	%ebx
   150  ',`
   151  	add	$12, %esp		C deallocate params
   152  ')
   153  L(reduced):
   154  	pop	%edx			C edx = odd v0 pushed earlier; eax = reduced U
   155  
   156  	LEAL(	ctz_table, %esi)
   157  	test	%eax, %eax
   158  	mov	%eax, %ecx		C ecx selects the first table index
   159  	jnz	L(mid)			C reduced U nonzero: enter the loop
   160  	jmp	L(end)			C reduced U zero: odd part of the gcd is v0
   161  
        C Main loop.  At L(mid): eax = x, edx = y which is odd, esi = address of
        C ctz_table, and ecx has the same trailing zero bits as eax.  Each pass
        C strips trailing zeros of x using the table, then forms x-y in eax and
        C y-x in ecx.  If x-y borrows, L(top) takes y-x as the new x and the old x
        C kept in edi as the new y.  The loop ends when x-y is zero, leaving the
        C odd part of the gcd in edx.  The jz after movzbl tests ZF produced by the
        C and, since movzbl does not modify flags.
   162  	ALIGN(16)			C               K8    BC    P4    NHM   SBR
   163  L(top):	cmovc(	%ecx, %eax)		C if x-y < 0	0
   164  	cmovc(	%edi, %edx)		C use x,y-x	0
   165  L(mid):	and	$MASK, %ecx		C		0
   166  	movzbl	(%esi,%ecx), %ecx	C		1
   167  	jz	L(shift_alot)		C		1
   168  	shr	%cl, %eax		C		3
   169  	mov	%eax, %edi		C		4
   170  	mov	%edx, %ecx		C		3
   171  	sub	%eax, %ecx		C		4
   172  	sub	%edx, %eax		C		4
   173  	jnz	L(top)			C		5
   174  
   175  L(end):	pop	%ecx			C common twos count pushed earlier
   176  	mov	%edx, %eax
   177  	shl	%cl, %eax		C result = odd part of gcd shifted left by the common twos
   178  	pop	%esi
   179  	pop	%edi
   180  	ret
   181  
   182  L(shift_alot):
   183  	shr	$MAXSHIFT, %eax		C low MAXSHIFT bits of x were all zero: drop them
   184  	mov	%eax, %ecx		C look up the next group of low bits
   185  	jmp	L(mid)
   186  
   187  ifdef(`PIC_WITH_EBX',`dnl
   188  L(movl_eip_ebx):
   189  	mov	(%esp), %ebx	C ebx = return address of this helper
   190  	ret
   191  ')
   192  EPILOGUE()
   193  ASM_END()