github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_1_1.asm (about)

     1  dnl  AMD64 mpn_mod_1_1p
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.
     4  
     5  dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 6
    37  C AMD K10	 6
    38  C Intel P4	26
    39  C Intel core2	12.5
    40  C Intel NHM	11.3
    41  C Intel SBR	 8.4	(slowdown, old code took 8.0)
    42  C Intel atom	26
    43  C VIA nano	13
    44  
    45  define(`B2mb',   `%r10')
    46  define(`B2modb', `%r11')
    47  define(`ap',     `%rdi')
    48  define(`n',      `%rsi')
    49  define(`pre',    `%r8')
    50  define(`b',      `%rbx')
    51  
    52  define(`r0',     `%rbp') C r1 kept in %rax
    53  define(`r2',	 `%rcx')  C kept negated. Also used as shift count
    54  define(`t0',     `%r9')
    55  
    56  C mp_limb_t
    57  C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
    58  C                       %rdi         %rsi         %rdx                %rcx
    59  C The pre array contains bi, cnt, B1modb, B2modb
    60  C Note: This implementation needs B1modb only when cnt > 0
    61  
    62  C The iteration is almost as follows,
    63  C
    64  C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
    65  C
    66  C where r2 is a single bit represented as a mask. But to make sure that the
    67  C result fits in two limbs and a bit, carry from the addition
    68  C
    69  C   r_0 + r_2 B2modb
    70  C
    71  C is handled specially. On carry, we subtract b to cancel the carry,
    72  C and we use instead the value
    73  C
    74  C   r_0 + B2mb (mod B)
    75  C
    76  C This addition can be issued early since it doesn't depend on r2, and it is
    77  C the source of the cmov in the loop.
    78  C
    79  C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
    80  
    81  ABI_SUPPORT(DOS64)
    82  ABI_SUPPORT(STD64)
    83  
    84  ASM_START()
    85  	TEXT
    86  	ALIGN(16)
    87  PROLOGUE(mpn_mod_1_1p)
    88  	FUNC_ENTRY(4)
    89  	push	%rbp			C callee-saved, becomes r0
    90  	push	%rbx			C callee-saved, becomes b
    91  	mov	%rdx, b
    92  	mov	%rcx, pre		C pre = bi, cnt, B1modb, B2modb
    93  
    94  	mov	-8(ap, n, 8), %rax	C r1 = top limb ap[n-1]
    95  	cmp	$3, n
    96  	jnc	L(first)		C three or more limbs
    97  	mov	-16(ap, n, 8), r0	C short path, r0 = next limb down
    98  	jmp	L(reduce_two)
    99  
   100  L(first):
   101  	C First iteration, no r2
   102  	mov	24(pre), B2modb
   103  	mul	B2modb			C rdx:rax = ap[n-1] * B2modb
   104  	mov	-24(ap, n, 8), r0	C r0 = ap[n-3]
   105  	add	%rax, r0		C fold product into ap[n-2]:ap[n-3]
   106  	mov	-16(ap, n, 8), %rax
   107  	adc	%rdx, %rax		C r1 = ap[n-2] + high product + cy
   108  	sbb	r2, r2			C r2 = all-ones mask on carry out
   109  	sub	$4, n			C loop index runs n-4 down to 0
   110  	jc	L(reduce_three)		C n = 3 case, nothing left to loop on
   111  
   112  	mov	B2modb, B2mb
   113  	sub	b, B2mb			C B2mb = B2modb - b mod B
   114  
   115  	ALIGN(16)
   116  L(top):	and	B2modb, r2		C r2 = mask set ? B2modb : 0
   117  	lea	(B2mb, r0), t0		C t0 = r0 + B2modb - b, issued early
   118  	mul	B2modb			C rdx:rax = r1 * B2modb
   119  	add	r0, r2			C r2 = r0 + r2 B2modb
   120  	mov	(ap, n, 8), r0		C load next lower limb u
   121  	cmovc	t0, r2			C on carry use t0, which cancels it
   122  	add	%rax, r0		C r0 = u + low product
   123  	mov	r2, %rax
   124  	adc	%rdx, %rax		C r1 = r2 + high product + cy
   125  	sbb	r2, r2			C regenerate the r2 mask
   126  	sub	$1, n
   127  	jnc	L(top)
   128  
   129  L(reduce_three):
   130  	C Eliminate r2
   131  	and	b, r2			C r2 = mask set ? b : 0
   132  	sub	r2, %rax		C invariant gives r1 < b when r2 set
   133  
   134  L(reduce_two):
   135  	mov	8(pre), R32(%rcx)	C rcx = cnt, the normalization shift
   136  	test	R32(%rcx), R32(%rcx)
   137  	jz	L(normalized)
   138  
   139  	C Unnormalized, use B1modb to reduce to size < B (b+1)
   140  	mulq	16(pre)			C rdx:rax = r1 * B1modb
   141  	xor	t0, t0
   142  	add	%rax, r0
   143  	adc	%rdx, t0
   144  	mov	t0, %rax		C new r1:r0, now < B (b+1)
   145  
   146  	C Left-shift to normalize
   147  ifdef(`SHLD_SLOW',`
   148  	shl	R8(%rcx), %rax
   149  	mov	r0, t0
   150  	neg	R32(%rcx)
   151  	shr	R8(%rcx), t0
   152  	or	t0, %rax
   153  	neg	R32(%rcx)
   154  ',`
   155  	shld	R8(%rcx), r0, %rax
   156  ')
   157  	shl	R8(%rcx), r0
   158  	jmp	L(udiv)
   159  
   160  L(normalized):
   161  	mov	%rax, t0
   162  	sub	b, t0
   163  	cmovnc	t0, %rax		C ensure r1 < b
   164  
   165  L(udiv):				C divide r1:r0 by b using bi
   166  	lea	1(%rax), t0		C t0 = r1 + 1
   167  	mulq	(pre)			C rdx:rax = r1 * bi
   168  	add	r0, %rax
   169  	adc	t0, %rdx		C rdx = quotient estimate q
   170  	imul	b, %rdx
   171  	sub	%rdx, r0		C r0 = remainder candidate, may wrap
   172  	cmp	r0, %rax
   173  	lea	(b, r0), %rax		C rax = r0 + b
   174  	cmovnc	r0, %rax		C add b back only if r0 wrapped
   175  	cmp	b, %rax
   176  	jnc	L(fix)			C rare extra reduction needed
   177  L(ok):	shr	R8(%rcx), %rax		C undo normalization shift
   178  
   179  	pop	%rbx
   180  	pop	%rbp
   181  	FUNC_EXIT()
   182  	ret
   183  L(fix):	sub	b, %rax
   184  	jmp	L(ok)
   185  EPILOGUE()
   186  
   187  	ALIGN(16)
   188  PROLOGUE(mpn_mod_1_1p_cps)
   189  	FUNC_ENTRY(2)
   190  	push	%rbp
   191  	bsr	%rsi, %rcx		C rcx = index of highest set bit of b
   192  	push	%rbx
   193  	mov	%rdi, %rbx		C rbx = result array, cps
   194  	push	%r12
   195  	xor	$63, R32(%rcx)		C rcx = cnt = leading zeros of b
   196  	mov	%rsi, %r12
   197  	mov	R32(%rcx), R32(%rbp)	C save cnt across the call
   198  	sal	R8(%rcx), %r12		C r12 = bnorm = b << cnt
   199  IFSTD(`	mov	%r12, %rdi	')	C pass parameter
   200  IFDOS(`	mov	%r12, %rcx	')	C pass parameter
   201  	ASSERT(nz, `test $15, %rsp')
   202  	CALL(	mpn_invert_limb)	C rax = bi, inverse of bnorm
   203  	neg	%r12			C r12 = B - bnorm = B mod bnorm
   204  	mov	%r12, %r8		C keep B mod bnorm for B1modb below
   205  	mov	%rax, (%rbx)		C store bi
   206  	mov	%rbp, 8(%rbx)		C store cnt
   207  	imul	%rax, %r12		C r12 = -bnorm bi mod B, = B^2 mod bnorm
   208  	mov	%r12, 24(%rbx)		C store B2modb
   209  	mov	R32(%rbp), R32(%rcx)
   210  	test	R32(%rcx), R32(%rcx)
   211  	jz	L(z)			C b already normalized, B1modb unused
   212  
   213  	mov	$1, R32(%rdx)
   214  ifdef(`SHLD_SLOW',`
   215  	C Destroys %rax, unlike shld. Otherwise, we could do B1modb
   216  	C before B2modb, and get rid of the move %r12, %r8 above.
   217  
   218  	shl	R8(%rcx), %rdx
   219  	neg	R32(%rcx)
   220  	shr	R8(%rcx), %rax
   221  	or	%rax, %rdx
   222  	neg	R32(%rcx)
   223  ',`
   224  	shld	R8(%rcx), %rax, %rdx	C rdx = 2^cnt + high cnt bits of bi
   225  ')
   226  	imul	%rdx, %r8
   227  	shr	R8(%rcx), %r8		C scale back to the unshifted domain
   228  	mov	%r8, 16(%rbx)		C store B1modb
   229  L(z):
   230  	pop	%r12
   231  	pop	%rbx
   232  	pop	%rbp
   233  	FUNC_EXIT()
   234  	ret
   235  EPILOGUE()
   236  ASM_END()