github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/gcd_1.asm (about)

     1  dnl  AMD64 mpn_gcd_1 -- mpn by 1 gcd.
     2  
     3  dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
     4  dnl  Granlund.
     5  
     6  dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  
    37  C	     cycles/bit (approx)
    38  C AMD K8,K9	 5.21                 (4.95)
    39  C AMD K10	 5.15                 (5.00)
    40  C AMD bd1	 5.42                 (5.14)
    41  C AMD bobcat	 6.71                 (6.56)
    42  C Intel P4	13.5                 (12.75)
    43  C Intel core2	 6.20                 (6.16)
    44  C Intel NHM	 6.49                 (6.25)
    45  C Intel SBR	 7.75                 (7.57)
    46  C Intel atom	 8.77                 (8.54)
    47  C VIA nano	 6.60                 (6.20)
    48  C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
    49  
    50  C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
    51  C It has MASK+1 byte entries and is indexed by a value masked with MASK.
    52  deflit(MAXSHIFT, 7)
    53  deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))	C 2^MAXSHIFT - 1
    54  
    55  DEF_OBJECT(ctz_table,64)
    56  	.byte	MAXSHIFT		C entry 0: none of the low MAXSHIFT bits is set
    57  forloop(i,1,MASK,
    58  `	.byte	m4_count_trailing_zeros(i)
    59  ')
    60  END_OBJECT(ctz_table)
    61  
    62  C Threshold of when to call bmod when U is one limb.  Should be about
    63  C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
    64  define(`BMOD_THRES_LOG2', 8)	C one-limb path reduces when v0 <= u0 >> 8
    65  
    66  C INPUT PARAMETERS
    67  define(`up',    `%rdi')		C address of the limbs of U
    68  define(`n',     `%rsi')		C number of limbs in U
    69  define(`v0',    `%rdx')		C the single limb V
    70  
    71  ABI_SUPPORT(DOS64)
    72  ABI_SUPPORT(STD64)
    73  C STACK_ALLOC is subtracted from rsp, after two pushes, before each call below.
    74  IFDOS(`define(`STACK_ALLOC', 40)')
    75  IFSTD(`define(`STACK_ALLOC', 8)')
    76  C mpn_gcd_1 (up, n, v0) -- return in rax the gcd of the n-limb U at up and v0.
    77  ASM_START()
    78  	TEXT
    79  	ALIGN(16)
    80  PROLOGUE(mpn_gcd_1)
    81  	FUNC_ENTRY(3)
    82  	mov	(up), %rax		C U low limb
    83  	mov	$-1, R32(%rcx)		C starts at -1 so the first inc gives 0
    84  	or	v0, %rax		C x | y
    85  C Count the low zero bits of u0|v0, the power of 2 common to u0 and v0.
    86  L(twos):
    87  	inc	R32(%rcx)
    88  	shr	%rax			C CF = bit shifted out
    89  	jnc	L(twos)			C repeat until a 1 bit comes out
    90  C rcx = common twos.  NOTE(review): L(twos) needs u0|v0 != 0 to terminate.
    91  	shr	R8(%rcx), v0		C drop the common twos from v0
    92  	push	%rcx			C common twos
    93  C Make v0 odd.  NOTE(review): never ends if v0 is 0; presumably v0 != 0 is required.
    94  L(divide_strip_y):
    95  	shr	v0
    96  	jnc	L(divide_strip_y)
    97  	adc	v0, v0			C v0 = 2*v0 + CF, restores the 1 bit shifted out
    98  
    99  	cmp	$1, n
   100  	jnz	L(reduce_nby1)
   101  
   102  C Both U and V are single limbs, reduce with bmod if u0 >> v0.
   103  	mov	(up), %r8
   104  	mov	%r8, %rax		C rax = u0, taken as x when no reduction is done
   105  	shr	$BMOD_THRES_LOG2, %r8
   106  	cmp	%r8, v0
   107  	ja	L(noreduce)		C skip the call if v0 > u0 >> BMOD_THRES_LOG2
   108  	push	v0			C kept across the call, popped at L(reduced)
   109  	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
   110  C Reached with v0 pushed and STACK_ALLOC reserved.  IFDOS lines copy up, n, v0 to rcx, rdx, r8.
   111  L(bmod):
   112  IFDOS(`	mov	%rdx, %r8	')
   113  IFDOS(`	mov	%rsi, %rdx	')
   114  IFDOS(`	mov	%rdi, %rcx	')
   115  	ASSERT(nz, `test $15, %rsp')
   116  	CALL(	mpn_modexact_1_odd)
   117  C Both call paths join here; rax (presumably the callee return value) is used as x below.
   118  L(reduced):
   119  	add	$STACK_ALLOC, %rsp	C undo the sub done before the call
   120  	pop	%rdx			C rdx = the odd v0 pushed before the call
   121  C Loop setup: rax = x, rdx = y (odd), rsi = address of ctz_table, rcx = copy of x.
   122  L(noreduce):
   123  	LEA(	ctz_table, %rsi)
   124  	test	%rax, %rax		C flags are still valid after the mov below
   125  	mov	%rax, %rcx
   126  	jnz	L(mid)			C x != 0, enter the loop
   127  	jmp	L(end)			C x == 0, y is the odd part of the result
   128  C n != 1: reduce U with one of two calls, selected by n below.
   129  L(reduce_nby1):
   130  	push	v0			C kept across the call, popped at L(reduced)
   131  	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
   132  C Below the threshold use mpn_modexact_1_odd at L(bmod), otherwise mpn_mod_1.
   133  	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
   134  	jl	L(bmod)			C signed compare of n with the threshold
   135  IFDOS(`	mov	%rdx, %r8	')
   136  IFDOS(`	mov	%rsi, %rdx	')
   137  IFDOS(`	mov	%rdi, %rcx	')
   138  	ASSERT(nz, `test $15, %rsp')
   139  	CALL(	mpn_mod_1)
   140  	jmp	L(reduced)
   141  C x = rax, y = rdx (odd).  rcx = y-x feeds the ctz lookup; it has the same low zeros as x-y.
   142  	ALIGN(16)			C               K8    BC    P4    NHM   SBR
   143  L(top):	cmovc	%rcx, %rax		C if x-y < 0	0
   144  	cmovc	%rdi, %rdx		C use x,y-x	0
   145  L(mid):	and	$MASK, R32(%rcx)	C		0
   146  	movzbl	(%rsi,%rcx), R32(%rcx)	C		1
   147  	jz	L(shift_alot)		C		1
   148  	shr	R8(%rcx), %rax		C		3
   149  	mov	%rax, %rdi		C		4
   150  	mov	%rdx, %rcx		C		3
   151  	sub	%rax, %rcx		C		4
   152  	sub	%rdx, %rax		C		4
   153  	jnz	L(top)			C		5
   154  C Here x == y (or x was 0 on entry): rdx is the odd part; shift the common twos back in.
   155  L(end):	pop	%rcx			C common twos pushed near the entry
   156  	mov	%rdx, %rax
   157  	shl	R8(%rcx), %rax
   158  	FUNC_EXIT()
   159  	ret
   160  C The low MAXSHIFT bits of rcx were all zero (ZF from the and at L(mid)): shift x and retry.
   161  L(shift_alot):
   162  	shr	$MAXSHIFT, %rax
   163  	mov	%rax, %rcx		C refresh the lookup copy from the shifted x
   164  	jmp	L(mid)
   165  EPILOGUE()