github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/gcd_1.asm (about)

     1  dnl  x86 mpn_gcd_1 optimised for processors with fast BSF.
     2  
     3  dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
     6  dnl  Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  
    37  C	     cycles/bit (approx)
    38  C AMD K7	 7.80
    39  C AMD K8,K9	 7.79
    40  C AMD K10	 4.08
    41  C AMD bd1	 ?
    42  C AMD bobcat	 7.82
    43  C Intel P4-2	14.9
    44  C Intel P4-3/4	14.0
    45  C Intel P6/13	 5.09
    46  C Intel core2	 4.22
    47  C Intel NHM	 5.00
    48  C Intel SBR	 5.00
    49  C Intel atom	17.1
    50  C VIA nano	?
    51  C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
    52  
    53  C Threshold of when to call bmod when U is one limb.  Should be about
    54  C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
        C In the n == 1 path below, the reduction call is skipped when
        C v0 > (u0 >> BMOD_THRES_LOG2), where v0 has already been shifted
        C right until odd; otherwise mpn_modexact_1_odd is called.
    55  define(`BMOD_THRES_LOG2', 6)
    56  
    57  
        C Register aliases for the three stack arguments, loaded right after
        C the two register saves at the start of mpn_gcd_1.
    58  define(`up',    `%edi')	C arg 1: pointer to U, low limb read at (up)
    59  define(`n',     `%esi')	C arg 2: size of U, compared against 1 and a threshold
    60  define(`v0',    `%edx')	C arg 3: single-limb operand V
    61  
    62  
    63  ASM_START()
    64  	TEXT
    65  	ALIGN(16)
    66  PROLOGUE(mpn_gcd_1)
        C mpn_gcd_1 (up, n, v0): GCD of the n-limb operand at up and the
        C single limb v0.  All three arguments are read from the stack.
        C
        C In:   up, n, v0 at 12/16/20(%esp) once %edi and %esi are pushed.
        C Out:  %eax = result.
        C Saved and restored here: %edi, %esi (plus %ebx when PIC_WITH_EBX
        C is defined).  %eax, %ecx, %edx and the flags are overwritten.
        C
        C Outline:
        C   1. Record min(ctz(u0),ctz(v0)), the shared power of two, and
        C      shift v0 right until it is odd.
        C   2. Reduce U to one limb: no call if n == 1 and u0 is small
        C      relative to v0, otherwise mpn_modexact_1_odd or mpn_mod_1.
        C   3. Run a binary GCD loop on the two single limbs.
        C   4. Shift the shared power of two back into the result.
        C
        C NOTE(review): bsf leaves its destination undefined for a zero
        C source, so u0|v0 and v0 are presumably required to be nonzero.
        C Confirm against the documented mpn_gcd_1 contract.
    67  	push	%edi
    68  	push	%esi
    69  
    70  	mov	12(%esp), up	C arg 1 (past 8 bytes of saves + return address)
    71  	mov	16(%esp), n	C arg 2
    72  	mov	20(%esp), v0	C arg 3
    73  
    74  	mov	(up), %eax	C U low limb
    75  	or	v0, %eax
    76  	bsf	%eax, %eax	C min(ctz(u0),ctz(v0))
    77  
    78  	bsf	v0, %ecx	C ecx = ctz(v0)
    79  	shr	%cl, v0	C strip the low zero bits, v0 is now odd
    80  
    81  	push	%eax		C preserve common twos over call
    82  	push	v0		C preserve v0 argument over call
    83  
    84  	cmp	$1, n
    85  	jnz	L(reduce_nby1)	C n != 1: pick a reduction routine by size
    86  
    87  C Both U and V are single limbs, reduce with bmod if u0 >> v0.
    88  	mov	(up), %ecx
    89  	mov	%ecx, %eax	C eax = u0, used unchanged if no call is made
    90  	shr	$BMOD_THRES_LOG2, %ecx
    91  	cmp	%ecx, v0
    92  	ja	L(reduced)	C v0 > (u0 >> BMOD_THRES_LOG2): skip the call
    93  	jmp	L(bmod)
    94  
    95  L(reduce_nby1):
        C Signed compare of n against BMOD_1_TO_MOD_1_THRESHOLD (defined
        C outside this file): below it use mpn_modexact_1_odd, else mpn_mod_1.
    96  	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
    97  	jl	L(bmod)
    98  ifdef(`PIC_WITH_EBX',`dnl
    99  	push	%ebx
   100  	add	$-4, %esp	C 4 bytes of padding, released after the call
   101  	call	L(movl_eip_to_ebx)	C ebx = address of the next instruction
   102  	add	$_GLOBAL_OFFSET_TABLE_, %ebx
   103  ')
   104  	push	v0		C param 3
   105  	push	n		C param 2
   106  	push	up		C param 1
   107  	CALL(	mpn_mod_1)
   108  	jmp	L(called)
   109  
   110  L(bmod):
   111  ifdef(`PIC_WITH_EBX',`dnl
   112  	push	%ebx
   113  	add	$-4, %esp	C 4 bytes of padding, released after the call
   114  	call	L(movl_eip_to_ebx)	C ebx = address of the next instruction
   115  	add	$_GLOBAL_OFFSET_TABLE_, %ebx
   116  ')
   117  	push	v0		C param 3
   118  	push	n		C param 2
   119  	push	up		C param 1
   120  	CALL(	mpn_modexact_1_odd)
   121  
   122  L(called):
        C Both call paths join here with the returned value in eax.
   123  ifdef(`PIC_WITH_EBX',`dnl
   124  	add	$16, %esp	C deallocate params (12) and padding (4)
   125  	pop	%ebx
   126  ',`
   127  	add	$12, %esp	C deallocate params
   128  ')
   129  L(reduced):
   130  	pop	%edx		C edx = odd v0 pushed before the call
   131  
   132  	bsf	%eax, %ecx	C ecx = ctz(eax), ZF set when eax == 0
   133  C	test	%eax, %eax	C FIXME: does this lower latency?
   134  	jnz	L(mid)
   135  	jmp	L(end)		C eax == 0: edx is already the odd part of the result
   136  
        C Binary GCD loop.  Entering L(mid): edx is odd, eax is nonzero and
        C ecx = ctz(eax).  Each pass strips the low zero bits of eax, then
        C forms both differences edx-eax (esi) and eax-edx (eax).  The carry
        C from eax-edx selects, via the cmovc pair at L(top), eax = the
        C positive difference and edx = the smaller operand.  The loop exits
        C when the operands are equal, leaving the odd part of the GCD in edx.
        C The numeric columns on the right are the original per-CPU timing
        C annotations (presumably issue cycles and latencies).
   137  	ALIGN(16)		C               K10   BD    C2    NHM   SBR
   138  L(top):	cmovc(	%esi, %eax)	C if x-y < 0    0,3   0,3   0,6   0,5   0,5
   139  	cmovc(	%edi, %edx)	C use x,y-x     0,3   0,3   2,8   1,7   1,7
   140  L(mid):	shr	%cl, %eax	C               1,7   1,6   2,8   2,8   2,8
   141  	mov	%edx, %esi	C               1     1     4     3     3
   142  	sub	%eax, %esi	C               2     2     5     4     4
   143  	bsf	%esi, %ecx	C               3     3     6     5     5
   144  	mov	%eax, %edi	C               2     2     3     3     4
   145  	sub	%edx, %eax	C               2     2     4     3     4
   146  	jnz	L(top)		C
   147  
   148  L(end):	pop	%ecx		C ecx = common twos count pushed earlier
   149  	mov	%edx, %eax
   150  	shl	%cl, %eax	C result = odd part shifted left by the common twos
   151  
   152  	pop	%esi
   153  	pop	%edi
   154  	ret
   155  
   156  ifdef(`PIC_WITH_EBX',`dnl
   157  L(movl_eip_to_ebx):
   158  	mov	(%esp), %ebx	C ebx = return address, i.e. the address after the call
   159  	ret
   160  ')
   161  EPILOGUE()