github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_1_2.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_1_2.asm (about)

     1  dnl  AMD64 mpn_mod_1s_2p
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 4
    37  C AMD K10	 4
    38  C Intel P4	19
    39  C Intel core2	 8
    40  C Intel NHM	 6.5
    41  C Intel SBR	 4.5
    42  C Intel atom	28
    43  C VIA nano	 8
    44  
    45  ABI_SUPPORT(DOS64)
    46  ABI_SUPPORT(STD64)
    47  
    48  ASM_START()
    49  	TEXT
    50  	ALIGN(16)
    51  PROLOGUE(mpn_mod_1s_2p)
        C Reduce the n 64-bit limbs at ap to a single-limb remainder, folding two
        C limbs per step with the constants precomputed by mpn_mod_1s_2p_cps.
        C
        C Arguments as read after FUNC_ENTRY(4):
        C   rdi  ap   limb array; processing starts at ap[n-1] and walks downward
        C   rsi  n    limb count; n = 1 is handled at L(one), n = 0 is not checked
        C   rdx  b    divisor used by the final reduction
        C             NOTE(review): presumably already shifted left by cnt, since the
        C             result is shifted right by cnt before returning -- confirm
        C             against the caller
        C   rcx  cps  table: 0(cps) = bi, 8(cps) = cnt, 16(cps) = B1modb,
        C             24(cps) = B2modb, 32(cps) = B3modb (names as in the store
        C             comments of mpn_mod_1s_2p_cps)
        C Result: rax.
        C
        C Register roles inside the routine:
        C   r14 = b, r13 = cps, r10 = B1modb, rbx = B2modb, rbp = B3modb
        C   r8:r9 and r12:r11 (high:low) alternate as the two-limb accumulator
        C   rdx:rax carries a pending accumulator_high * B3modb product between
        C   the two halves of the loop
    52  	FUNC_ENTRY(4)
    53  	push	%r14
    54  	test	$1, R8(%rsi)		C n odd?  Flags are consumed by je L(b0) below
    55  	mov	%rdx, %r14		C r14 = b
    56  	push	%r13
    57  	mov	%rcx, %r13		C r13 = cps
    58  	push	%r12
    59  	push	%rbp
    60  	push	%rbx
    61  	mov	16(%rcx), %r10		C r10 = B1modb
    62  	mov	24(%rcx), %rbx		C rbx = B2modb
    63  	mov	32(%rcx), %rbp		C rbp = B3modb
    64  	je	L(b0)			C n even: start from the top two limbs
    65  	dec	%rsi			C n odd: rsi = n-1
    66  	je	L(one)			C n was 1
    67  	mov	-8(%rdi,%rsi,8), %rax	C ap[n-2]
    68  	mul	%r10			C rdx:rax = ap[n-2] * B1modb
    69  	mov	%rax, %r9
    70  	mov	%rdx, %r8
    71  	mov	(%rdi,%rsi,8), %rax	C ap[n-1]
    72  	add	-16(%rdi,%rsi,8), %r9	C + ap[n-3]
    73  	adc	$0, %r8
    74  	mul	%rbx			C rdx:rax = ap[n-1] * B2modb
    75  	add	%rax, %r9
    76  	adc	%rdx, %r8		C r8:r9 = ap[n-3] + ap[n-2]*B1modb + ap[n-1]*B2modb
    77  	jmp	L(11)
    78  
    79  L(b0):	mov	-8(%rdi,%rsi,8), %r8	C r8 = ap[n-1], accumulator high limb
    80  	mov	-16(%rdi,%rsi,8), %r9	C r9 = ap[n-2], accumulator low limb
    81  
    82  L(11):	sub	$4, %rsi
    83  	jb	L(ed2)			C taken when no unprocessed limb pair remains
    84  	lea	40(%rdi,%rsi,8), %rdi	C bias ap so -40/-32(%rdi) address the next lower pair
    85  	mov	-40(%rdi), %r11		C low limb of the next pair
    86  	mov	-32(%rdi), %rax		C high limb of the next pair
    87  	jmp	L(m0)
    88  
        C Loop, unrolled twice.  The half starting at L(top) folds the r12:r11
        C accumulator and a new limb pair into r8:r9; the half starting at L(m0)
        C folds r8:r9 and a new limb pair into r12:r11.  Each half ends with a
        C mul by B3modb whose rdx:rax result is added in by the following half or
        C at L(ed1).
    89  	ALIGN(16)
    90  L(top):	mov	-24(%rdi), %r9		C low limb of the next pair
    91  	add	%rax, %r11		C add the pending r8 * B3modb product ...
    92  	mov	-16(%rdi), %rax		C high limb of the next pair
    93  	adc	%rdx, %r12		C ... completing r12:r11
    94  	mul	%r10			C high limb * B1modb
    95  	add	%rax, %r9
    96  	mov	%r11, %rax
    97  	mov	%rdx, %r8
    98  	adc	$0, %r8
    99  	mul	%rbx			C r11 * B2modb
   100  	add	%rax, %r9
   101  	mov	%r12, %rax
   102  	adc	%rdx, %r8
   103  	mul	%rbp			C r12 * B3modb, left pending in rdx:rax
   104  	sub	$2, %rsi
   105  	jb	L(ed1)			C no more pairs: r8:r9 plus pending product is the result
   106  	mov	-40(%rdi), %r11		C low limb of the next pair
   107  	add	%rax, %r9		C add the pending r12 * B3modb product ...
   108  	mov	-32(%rdi), %rax		C high limb of the next pair
   109  	adc	%rdx, %r8		C ... completing r8:r9
   110  L(m0):	mul	%r10			C high limb * B1modb
   111  	add	%rax, %r11
   112  	mov	%r9, %rax
   113  	mov	%rdx, %r12
   114  	adc	$0, %r12
   115  	mul	%rbx			C r9 * B2modb
   116  	add	%rax, %r11
   117  	lea	-32(%rdi), %rdi		C ap -= 4
   118  	mov	%r8, %rax
   119  	adc	%rdx, %r12
   120  	mul	%rbp			C r8 * B3modb, left pending in rdx:rax
   121  	sub	$2, %rsi
   122  	jae	L(top)
   123  
   124  L(ed0):	mov	%r11, %r9		C loop ended after the L(m0) half: move r12:r11 to r8:r9
   125  	mov	%r12, %r8
   126  L(ed1):	add	%rax, %r9		C add the pending B3modb product
   127  	adc	%rdx, %r8
   128  L(ed2):	mov	8(%r13), R32(%rdi)		C cnt
   129  	mov	%r8, %rax
   130  	mov	%r9, %r8
   131  	mul	%r10			C accumulator high limb * B1modb
   132  	add	%rax, %r8
   133  	adc	$0, %rdx		C rdx:r8 = accumulator low + accumulator high * B1modb
        C Final reduction of rdx:r8 by b using bi.  Entered with rdi = cnt.
        C NOTE(review): the right shift below uses -cnt as the count, which is a
        C shift by 0 when cnt = 0 -- presumably callers guarantee cnt != 0; confirm.
   134  L(1):	xor	R32(%rcx), R32(%rcx)
   135  	mov	%r8, %r9
   136  	sub	R32(%rdi), R32(%rcx)	C cl = -cnt
   137  	shr	R8(%rcx), %r9		C bits of r8 that move into the high limb
   138  	mov	R32(%rdi), R32(%rcx)	C cl = cnt
   139  	sal	R8(%rcx), %rdx
   140  	or	%rdx, %r9		C r9 = high limb of (rdx:r8) << cnt
   141  	sal	R8(%rcx), %r8		C r8 = low limb of (rdx:r8) << cnt
   142  	mov	%r9, %rax
   143  	mulq	(%r13)			C rdx:rax = r9 * bi
   144  	mov	%rax, %rsi
   145  	inc	%r9
   146  	add	%r8, %rsi
   147  	adc	%r9, %rdx		C rdx = quotient estimate, rsi = its low fraction word
   148  	imul	%r14, %rdx
   149  	sub	%rdx, %r8		C r8 = low limb - estimate * b (mod 2^64)
   150  	lea	(%r8,%r14), %rax	C candidate r8 + b
   151  	cmp	%r8, %rsi
   152  	cmovc	%rax, %r8		C if rsi < r8: add b back
   153  	mov	%r8, %rax
   154  	sub	%r14, %rax
   155  	cmovc	%r8, %rax		C keep r8 if r8 < b, else use r8 - b
   156  	mov	R32(%rdi), R32(%rcx)
   157  	shr	R8(%rcx), %rax		C shift the remainder back down by cnt
   158  	pop	%rbx
   159  	pop	%rbp
   160  	pop	%r12
   161  	pop	%r13
   162  	pop	%r14
   163  	FUNC_EXIT()
   164  	ret
   165  L(one):
   166  	mov	(%rdi), %r8		C single limb ap[0]
   167  	mov	8(%rcx), R32(%rdi)	C cnt; rcx still holds cps here
   168  	xor	%rdx, %rdx		C rdx:r8 = ap[0]
   169  	jmp	L(1)
   170  EPILOGUE()
   171  
   172  	ALIGN(16)
   173  PROLOGUE(mpn_mod_1s_2p_cps)
        C Fill the 5-word table that mpn_mod_1s_2p reads.
        C
        C Arguments as read after FUNC_ENTRY(2):
        C   rdi  cps  destination table, kept in rbx
        C   rsi  b    divisor; must be nonzero, since bsr leaves its destination
        C             undefined for a zero source
        C Stores:
        C    0(cps)  bi      value returned by mpn_invert_limb for b << cnt
        C    8(cps)  cnt     63 - index of the highest set bit of b
        C   16(cps)  B1modb
        C   24(cps)  B2modb
        C   32(cps)  B3modb
        C
        C DOS64: the Microsoft x64 calling convention requires the caller to
        C reserve 32 bytes of shadow space above the return address for every
        C call.  Without it the home area of the callee would overlap the
        C registers saved by the pushes below, so the space is allocated around
        C the call to mpn_invert_limb.  The adjustment is a multiple of 16, so
        C stack alignment at the call is unchanged.
   174  	FUNC_ENTRY(2)
   175  	push	%rbp
   176  	bsr	%rsi, %rcx		C index of the highest set bit of b
   177  	push	%rbx
   178  	mov	%rdi, %rbx		C rbx = cps
   179  	push	%r12
   180  	xor	$63, R32(%rcx)		C cnt = 63 - bit index
   181  	mov	%rsi, %r12
   182  	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
   183  	sal	R8(%rcx), %r12		C b << cnt
   184  IFSTD(`	mov	%r12, %rdi	')	C pass parameter
   185  IFDOS(`	mov	%r12, %rcx	')	C pass parameter
        IFDOS(`	sub	$32, %rsp	')	C reserve shadow space for the callee
        C NOTE(review): the condition below is nz on test $15, %rsp; confirm the
        C intended polarity against the ASSERT macro definition.
   186  	ASSERT(nz, `test $15, %rsp')
   187  	CALL(	mpn_invert_limb)
        IFDOS(`	add	$32, %rsp	')	C release shadow space; rax is unaffected
   188  	mov	%r12, %r8
   189  	mov	%rax, %r11		C r11 = bi, kept for the second mul
   190  	mov	%rax, (%rbx)		C store bi
   191  	mov	%rbp, 8(%rbx)		C store cnt
   192  	neg	%r8			C r8 = -(b << cnt)
   193  	mov	R32(%rbp), R32(%rcx)	C cl = cnt
   194  	mov	$1, R32(%rsi)
        C Both variants below form rsi = (1 << cnt) | (bi >> (64 - cnt)) and
        C leave rax = bi.
   195  ifdef(`SHLD_SLOW',`
   196  	shl	R8(%rcx), %rsi
   197  	neg	R32(%rcx)
   198  	mov	%rax, %rbp
   199  	shr	R8(%rcx), %rax
   200  	or	%rax, %rsi
   201  	mov	%rbp, %rax
   202  	neg	R32(%rcx)
   203  ',`
   204  	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
   205  ')
   206  	imul	%r8, %rsi		C rsi = -(b << cnt) * rsi (mod 2^64)
   207  	mul	%rsi			C rdx:rax = bi * rsi
   208  
   209  	add	%rsi, %rdx
   210  	shr	R8(%rcx), %rsi		C the value from the imul, shifted down by cnt
   211  	mov	%rsi, 16(%rbx)		C store B1modb
   212  
   213  	not	%rdx
   214  	imul	%r12, %rdx		C rdx = ~rdx * (b << cnt) (mod 2^64)
   215  	lea	(%rdx,%r12), %rsi	C candidate rdx + (b << cnt)
   216  	cmp	%rdx, %rax
   217  	cmovnc	%rdx, %rsi		C use rdx itself when rax >= rdx
   218  	mov	%r11, %rax
   219  	mul	%rsi			C rdx:rax = bi * rsi
   220  
   221  	add	%rsi, %rdx
   222  	shr	R8(%rcx), %rsi		C shift down by cnt
   223  	mov	%rsi, 24(%rbx)		C store B2modb
   224  
   225  	not	%rdx
   226  	imul	%r12, %rdx		C same step as above, result built in r12
   227  	add	%rdx, %r12
   228  	cmp	%rdx, %rax
   229  	cmovnc	%rdx, %r12
   230  
   231  	shr	R8(%rcx), %r12		C shift down by cnt
   232  	mov	%r12, 32(%rbx)		C store B3modb
   233  
   234  	pop	%r12
   235  	pop	%rbx
   236  	pop	%rbp
   237  	FUNC_EXIT()
   238  	ret
   239  EPILOGUE()