github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_1_4.asm (about)

     1  dnl  AMD64 mpn_mod_1s_4p
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 3
    37  C AMD K10	 3
    38  C Intel P4	15.5
    39  C Intel core2	 5
    40  C Intel corei	 4
    41  C Intel atom	23
    42  C VIA nano	 4.75
    43  
    44  ABI_SUPPORT(DOS64)
    45  ABI_SUPPORT(STD64)
        C Both the DOS64 and the STD64 ABI are declared as supported.  The
        C IFSTD/IFDOS lines in mpn_mod_1s_4p_cps pick the per-ABI argument
        C register where the two differ.  (All of these macros are defined
        C outside this file.)
    46  
    47  ASM_START()
    48  	TEXT
    49  	ALIGN(16)		C presumably 16-byte alignment for the entry that follows
    50  PROLOGUE(mpn_mod_1s_4p)
        C mpn_mod_1s_4p -- reduce a limb vector modulo a single-limb divisor,
        C consuming four limbs per loop iteration.  The running value is kept
        C as a two-limb residue rh:rl that is folded together with the next
        C four limbs using the precomputed constants B1modb..B5modb, and one
        C inverse-based reduction at the end produces the result in %rax.
        C
        C Registers as read by this code after FUNC_ENTRY(4) (that macro is
        C defined outside this file):
        C   %rdi  pointer to the lowest limb of the operand
        C   %rsi  limb count n
        C         NOTE(review): n = 0 would take the b0 path and read below
        C         the vector; presumably callers guarantee n >= 1 -- confirm.
        C   %rdx  divisor word, kept in %r15
        C         NOTE(review): presumably already shifted left by cnt, since
        C         the result is shifted right by cnt at the end -- confirm
        C         against the caller.
        C   %rcx  table laid out as mpn_mod_1s_4p_cps stores it, kept in %r14:
        C         offset 0 bi, 8 cnt, 16/24/32/40/48 B1modb..B5modb
        C
        C Register roles in the main loop:
        C   %r8:%r9    rh:rl, the running two-limb residue
        C   %rcx:%r10  accumulator for the next residue (%rcx is free for
        C              this because the table pointer lives in %r14)
        C   %r11 %rbx %rbp %r13 %r12   B1modb B2modb B3modb B4modb B5modb
    51  	FUNC_ENTRY(4)
    52  	push	%r15		C save the six callee-saved registers used below
    53  	push	%r14
    54  	push	%r13
    55  	push	%r12
    56  	push	%rbp
    57  	push	%rbx
    58  
    59  	mov	%rdx, %r15		C divisor word, needed again at L(end)
    60  	mov	%rcx, %r14		C table pointer (bi at 0, cnt at 8)
    61  	mov	16(%rcx), %r11		C B1modb
    62  	mov	24(%rcx), %rbx		C B2modb
    63  	mov	32(%rcx), %rbp		C B3modb
    64  	mov	40(%rcx), %r13		C B4modb
    65  	mov	48(%rcx), %r12		C B5modb
    66  	xor	R32(%r8), R32(%r8)	C rh = 0
    67  	mov	R32(%rsi), R32(%rdx)
    68  	and	$3, R32(%rdx)		C n mod 4 selects the entry sequence
    69  	je	L(b0)			C n mod 4 = 0
    70  	cmp	$2, R32(%rdx)
    71  	jc	L(b1)			C n mod 4 = 1
    72  	je	L(b2)			C n mod 4 = 2
    73  
        C n mod 4 = 3: start from the top three limbs.  rh:rl gets
        C limb0 + limb1*B1modb, and limb2*B2modb is left in rdx:rax for the
        C add at L(m0).
    74  L(b3):	lea	-24(%rdi,%rsi,8), %rdi
    75  	mov	8(%rdi), %rax
    76  	mul	%r11
    77  	mov	(%rdi), %r9
    78  	add	%rax, %r9
    79  	adc	%rdx, %r8
    80  	mov	16(%rdi), %rax
    81  	mul	%rbx
    82  	jmp	L(m0)
    83  
    84  	ALIGN(8)
        C n mod 4 = 0: start from the top four limbs.  rh:rl gets
        C limb0 + limb1*B1modb + limb2*B2modb, and limb3*B3modb is left in
        C rdx:rax for the add at L(m0).
    85  L(b0):	lea	-32(%rdi,%rsi,8), %rdi
    86  	mov	8(%rdi), %rax
    87  	mul	%r11
    88  	mov	(%rdi), %r9
    89  	add	%rax, %r9
    90  	adc	%rdx, %r8
    91  	mov	16(%rdi), %rax
    92  	mul	%rbx
    93  	add	%rax, %r9
    94  	adc	%rdx, %r8
    95  	mov	24(%rdi), %rax
    96  	mul	%rbp
    97  	jmp	L(m0)
    98  
    99  	ALIGN(8)
        C n mod 4 = 1: the top limb alone is the initial residue (rh is
        C still 0 from the xor above).
   100  L(b1):	lea	-8(%rdi,%rsi,8), %rdi
   101  	mov	(%rdi), %r9
   102  	jmp	L(m1)
   103  
   104  	ALIGN(8)
        C n mod 4 = 2: the top two limbs are the initial residue rh:rl.
   105  L(b2):	lea	-16(%rdi,%rsi,8), %rdi
   106  	mov	8(%rdi), %r8
   107  	mov	(%rdi), %r9
   108  	jmp	L(m1)
   109  
   110  	ALIGN(16)
        C Main loop.  %rdi points at the lowest limb consumed so far; the
        C four limbs below it are combined with the current residue:
        C   new = up[0] + up[1]*B1modb + up[2]*B2modb + up[3]*B3modb
        C         + rl*B4modb + rh*B5modb
        C The last product is left in rdx:rax and added at L(m0).
   111  L(top):	mov	-24(%rdi), %rax
   112  	mov	-32(%rdi), %r10
   113  	mul	%r11			C up[1] * B1modb
   114  	add	%rax, %r10
   115  	mov	-16(%rdi), %rax
   116  	mov	$0, R32(%rcx)		C not xor: CF from the add above must reach the adc
   117  	adc	%rdx, %rcx
   118  	mul	%rbx			C up[2] * B2modb
   119  	add	%rax, %r10
   120  	mov	-8(%rdi), %rax
   121  	adc	%rdx, %rcx
   122  	sub	$32, %rdi		C step down four limbs (all loads are done)
   123  	mul	%rbp			C up[3] * B3modb
   124  	add	%rax, %r10
   125  	mov	%r13, %rax
   126  	adc	%rdx, %rcx
   127  	mul	%r9			C rl * B4modb
   128  	add	%rax, %r10
   129  	mov	%r12, %rax
   130  	adc	%rdx, %rcx
   131  	mul	%r8			C rh * B5modb
   132  	mov	%r10, %r9
   133  	mov	%rcx, %r8
        C L(m0): add the pending product in rdx:rax into rh:rl.
        C L(m1): count four limbs off; loop while the remaining count is
        C still above zero (ja: no borrow and not zero).
   134  L(m0):	add	%rax, %r9
   135  	adc	%rdx, %r8
   136  L(m1):	sub	$4, %rsi
   137  	ja	L(top)
   138  
        C Final reduction of rh:rl.  First fold to rdx:r8 = rh*B1modb + rl,
        C then shift that two-limb value left by cnt and reduce it using the
        C precomputed inverse bi stored at offset 0 of the table.
        C NOTE(review): with cnt = 0 the shr by -cnt below is a shift by 0
        C (the count is taken mod 64), which would leave the whole low limb
        C in %rdi; presumably the table always has cnt > 0 -- confirm.
   139  L(end):	mov	8(%r14), R32(%rsi)	C cnt
   140  	mov	%r8, %rax
   141  	mul	%r11			C rh * B1modb
   142  	mov	%rax, %r8
   143  	add	%r9, %r8
   144  	adc	$0, %rdx		C rdx:r8 = rh*B1modb + rl
   145  	xor	R32(%rcx), R32(%rcx)
   146  	sub	R32(%rsi), R32(%rcx)	C cl = -cnt
   147  	mov	%r8, %rdi
   148  	shr	R8(%rcx), %rdi		C bits of the low limb that move into the high limb
   149  	mov	R32(%rsi), R32(%rcx)	C cl = cnt
   150  	sal	R8(%rcx), %rdx
   151  	or	%rdx, %rdi		C rdi = high limb of (rdx:r8) << cnt
   152  	mov	%rdi, %rax
   153  	mulq	(%r14)			C rdx:rax = rdi * bi
   154  	mov	%r15, %rbx		C divisor word saved at entry
   155  	mov	%rax, %r9
   156  	sal	R8(%rcx), %r8		C r8 = low limb of the shifted value
   157  	inc	%rdi
   158  	add	%r8, %r9
   159  	adc	%rdi, %rdx		C rdx:r9 = rdi*bi + ((rdi+1):r8); rdx is the quotient estimate
   160  	imul	%rbx, %rdx
   161  	sub	%rdx, %r8		C candidate remainder, mod 2^64
   162  	lea	(%r8,%rbx), %rax	C candidate + divisor
   163  	cmp	%r8, %r9
   164  	cmovc	%rax, %r8		C if r9 < candidate (unsigned), add the divisor back
   165  	mov	%r8, %rax
   166  	sub	%rbx, %rax
   167  	cmovc	%r8, %rax		C keep r8 if r8 < divisor, else use r8 - divisor
   168  	shr	R8(%rcx), %rax		C undo the cnt shift; result is in rax
   169  	pop	%rbx		C restore in reverse order of the pushes
   170  	pop	%rbp
   171  	pop	%r12
   172  	pop	%r13
   173  	pop	%r14
   174  	pop	%r15
   175  	FUNC_EXIT()
   176  	ret
   177  EPILOGUE()
   178  
   179  	ALIGN(16)
   180  PROLOGUE(mpn_mod_1s_4p_cps)
        C mpn_mod_1s_4p_cps -- fill the seven-word table that mpn_mod_1s_4p
        C reads.
        C
        C Registers as read by this code after FUNC_ENTRY(2) (that macro is
        C defined outside this file):
        C   %rdi  destination table, kept in %rbx
        C   %rsi  divisor b
        C         NOTE(review): b must be nonzero, since bsr leaves its
        C         destination undefined for a zero source -- confirm callers.
        C
        C Table layout written here (byte offsets):
        C    0  bi  = value returned by mpn_invert_limb for b << cnt
        C    8  cnt = 63 - (index of the highest set bit of b)
        C   16, 24, 32, 40, 48  B1modb .. B5modb, each stored shifted right
        C       by cnt (presumably B^k mod b for the limb base B -- the names
        C       come from the store comments below; confirm)
        C
        C %rbx, %rbp and %r12 hold the values that must survive the call.
   181  	FUNC_ENTRY(2)
   182  	push	%rbp
   183  	bsr	%rsi, %rcx		C index of the highest set bit of b
   184  	push	%rbx
   185  	mov	%rdi, %rbx		C table pointer
   186  	push	%r12
   187  	xor	$63, R32(%rcx)		C cnt = 63 - index
   188  	mov	%rsi, %r12
   189  	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
   190  	sal	R8(%rcx), %r12		C b << cnt
   191  IFSTD(`	mov	%r12, %rdi	')	C pass parameter
   192  IFDOS(`	mov	%r12, %rcx	')	C pass parameter
   193  	ASSERT(nz, `test $15, %rsp')
   194  	CALL(	mpn_invert_limb)
   195  	mov	%r12, %r8
   196  	mov	%rax, %r11		C r11 = bi, reloaded into rax before each later mul
   197  	mov	%rax, (%rbx)		C store bi
   198  	mov	%rbp, 8(%rbx)		C store cnt
   199  	neg	%r8			C r8 = -(b << cnt)
   200  	mov	R32(%rbp), R32(%rcx)	C cl = cnt for every shift below
   201  	mov	$1, R32(%rsi)
        C Both branches below build rsi = (1 << cnt) | (bi >> (64 - cnt)) and
        C leave bi in rax.  The SHLD_SLOW branch does it with separate shifts,
        C using rbp as scratch to keep bi (cnt has already been stored) and
        C negating ecx twice so that cl is cnt again afterwards.
        C NOTE(review): for cnt = 0 the branches differ (shld by 0 leaves
        C rsi = 1, the other branch ORs in all of bi); presumably cnt is
        C never 0 here -- confirm.
   202  ifdef(`SHLD_SLOW',`
   203  	shl	R8(%rcx), %rsi
   204  	neg	R32(%rcx)
   205  	mov	%rax, %rbp
   206  	shr	R8(%rcx), %rax
   207  	or	%rax, %rsi
   208  	mov	%rbp, %rax
   209  	neg	R32(%rcx)
   210  ',`
   211  	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
   212  ')
   213  	imul	%r8, %rsi		C rsi = -(b << cnt) * rsi, mod 2^64
   214  	mul	%rsi			C rdx:rax = bi * rsi
   215  
        C Each stage from here on has the same shape.  With x in rsi and
        C rdx:rax = bi * x, it stores x >> cnt and then forms the next x as
        C   r = ~(high + x) * (b << cnt), plus (b << cnt) if low < r
   216  	add	%rsi, %rdx		C high(bi * x) + x
   217  	shr	R8(%rcx), %rsi
   218  	mov	%rsi, 16(%rbx)		C store B1modb
   219  
   220  	not	%rdx
   221  	imul	%r12, %rdx		C r = ~rdx * (b << cnt)
   222  	lea	(%rdx,%r12), %rsi	C r + (b << cnt)
   223  	cmp	%rdx, %rax
   224  	cmovnc	%rdx, %rsi		C take plain r unless low(bi * x) < r
   225  	mov	%r11, %rax
   226  	mul	%rsi			C rdx:rax = bi * x for the next stage
   227  
   228  	add	%rsi, %rdx
   229  	shr	R8(%rcx), %rsi
   230  	mov	%rsi, 24(%rbx)		C store B2modb
   231  
   232  	not	%rdx
   233  	imul	%r12, %rdx
   234  	lea	(%rdx,%r12), %rsi
   235  	cmp	%rdx, %rax
   236  	cmovnc	%rdx, %rsi
   237  	mov	%r11, %rax
   238  	mul	%rsi
   239  
   240  	add	%rsi, %rdx
   241  	shr	R8(%rcx), %rsi
   242  	mov	%rsi, 32(%rbx)		C store B3modb
   243  
   244  	not	%rdx
   245  	imul	%r12, %rdx
   246  	lea	(%rdx,%r12), %rsi
   247  	cmp	%rdx, %rax
   248  	cmovnc	%rdx, %rsi
   249  	mov	%r11, %rax
   250  	mul	%rsi
   251  
   252  	add	%rsi, %rdx
   253  	shr	R8(%rcx), %rsi
   254  	mov	%rsi, 40(%rbx)		C store B4modb
   255  
        C Last stage: b << cnt is not needed afterwards, so the result is
        C built directly in r12 instead of going through rsi.
   256  	not	%rdx
   257  	imul	%r12, %rdx
   258  	add	%rdx, %r12
   259  	cmp	%rdx, %rax
   260  	cmovnc	%rdx, %r12
   261  
   262  	shr	R8(%rcx), %r12
   263  	mov	%r12, 48(%rbx)		C store B5modb
   264  
   265  	pop	%r12		C restore in reverse order of the pushes
   266  	pop	%rbx
   267  	pop	%rbp
   268  	FUNC_EXIT()
   269  	ret
   270  EPILOGUE()