github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/pentium4/aorslshC_n.asm (about)

     1  dnl  AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
     2  dnl  C is 1, 2, 3.  Optimized for Pentium 4.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund.
     5  
     6  dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  C	     cycles/limb
    35  C AMD K8,K9	 3.8
    36  C AMD K10	 3.8
    37  C Intel P4	 5.8
    38  C Intel core2	 4.75
    39  C Intel corei	 4.75
    40  C Intel atom	 ?
    41  C VIA nano	 4.75
    42  
    43  
    44  C INPUT PARAMETERS
    45  define(`rp',`%rdi')	C destination limb pointer
    46  define(`up',`%rsi')	C addend/minuend source limb pointer
    47  define(`vp',`%rdx')	C source limb pointer whose limbs are shifted left by LSH
    48  define(`n', `%rcx')	C limb count
    49  
    50  define(M, eval(m4_lshift(1,LSH)))	C M = 2^LSH, used as lea scale factor
    51  
    52  ABI_SUPPORT(DOS64)
    53  ABI_SUPPORT(STD64)
    54  
    55  ASM_START()
    56  	TEXT
    57  	ALIGN(16)
    58  PROLOGUE(func)	C rp[] = up[] +- (vp[] << LSH); carry/borrow out returned in %rax
    59  	FUNC_ENTRY(4)
    60  	push	%rbx	C callee-saved; holds a saved carry bit
    61  	push	%r12	C callee-saved; software-pipeline register
    62  	push	%rbp	C callee-saved; carries shift bits between limbs
    63  
    64  	mov	(vp), %r9	C r9 = vp[0]
    65  	shl	$LSH, %r9	C r9 = vp[0] << LSH
    66  	mov	4(vp), R32(%rbp)	C high 32 bits of vp[0]; NOTE(review): assumes RSH = 32-LSH, defined by the including file — confirm
    67  
    68  	xor	R32(%rbx), R32(%rbx)	C clear saved carry
    69  
    70  	mov	R32(n), R32(%rax)
    71  	and	$3, R32(%rax)	C n mod 4 selects the unrolled-loop entry point
    72  	jne	L(n00)		C n = 0, 4, 8, ...
    73  
    74  	mov	(up), %r8	C fall-through: n mod 4 == 0
    75  	mov	8(up), %r10
    76  	shr	$RSH, R32(%rbp)	C rbp = bits shifted out of vp[0]
    77  	ADDSUB	%r9, %r8	C r8 = up[0] +- (vp[0] << LSH)
    78  	mov	8(vp), %r9
    79  	lea	(%rbp,%r9,M), %r9	C r9 = (vp[1] << LSH) + carried-in shift bits
    80  	setc	R8(%rax)	C save carry/borrow out
    81  	mov	12(vp), R32(%rbp)	C high 32 bits of vp[1]
    82  	lea	-16(rp), rp	C bias rp so loop store offsets line up
    83  	jmp	L(L00)
    84  
    85  L(n00):	cmp	$2, R32(%rax)
    86  	jnc	L(n01)		C n = 1, 5, 9, ...
    87  	mov	(up), %r11
    88  	lea	-8(rp), rp	C bias rp for this entry point
    89  	shr	$RSH, R32(%rbp)	C bits shifted out of vp[0]
    90  	ADDSUB	%r9, %r11	C r11 = up[0] +- (vp[0] << LSH)
    91  	setc	R8(%rbx)	C save carry/borrow out
    92  	dec	n
    93  	jz	L(1)		C jump for n = 1
    94  	mov	8(up), %r8
    95  	mov	8(vp), %r9
    96  	lea	(%rbp,%r9,M), %r9	C next shifted limb + carried-in bits
    97  	mov	12(vp), R32(%rbp)
    98  	lea	8(up), up
    99  	lea	8(vp), vp
   100  	jmp	L(L01)
   101  
   102  L(n01):	jne	L(n10)		C n = 2, 6, 10, ...
   103  	mov	(up), %r12
   104  	mov	8(up), %r11
   105  	shr	$RSH, R32(%rbp)	C bits shifted out of vp[0]
   106  	ADDSUB	%r9, %r12	C r12 = up[0] +- (vp[0] << LSH)
   107  	mov	8(vp), %r9
   108  	lea	(%rbp,%r9,M), %r9	C next shifted limb + carried-in bits
   109  	setc	R8(%rax)	C save carry/borrow out
   110  	mov	12(vp), R32(%rbp)
   111  	lea	16(up), up
   112  	lea	16(vp), vp
   113  	jmp	L(L10)
   114  
   115  L(n10):	mov	(up), %r10	C remaining case: n = 3, 7, 11, ...
   116  	mov	8(up), %r12
   117  	shr	$RSH, R32(%rbp)	C bits shifted out of vp[0]
   118  	ADDSUB	%r9, %r10	C r10 = up[0] +- (vp[0] << LSH)
   119  	mov	8(vp), %r9
   120  	lea	(%rbp,%r9,M), %r9	C next shifted limb + carried-in bits
   121  	setc	R8(%rbx)	C save carry/borrow out
   122  	mov	12(vp), R32(%rbp)
   123  	lea	-24(rp), rp	C bias pointers for this entry point
   124  	lea	-8(up), up
   125  	lea	-8(vp), vp
   126  	jmp	L(L11)
   127  
   128  C Ripple fix-ups: applying the saved carry itself carried, so the
   129  C carry recorded by the preceding setc must be forced to 1.
   130  L(c0):	mov	$1, R8(%rbx)
   131  	jmp	L(rc0)
   132  L(c1):	mov	$1, R8(%rax)
   133  	jmp	L(rc1)
   134  L(c2):	mov	$1, R8(%rbx)
   135  	jmp	L(rc2)
   136  
   137  	ALIGN(16)
   138  L(top):	mov	(up), %r8	C not on critical path
   139  	shr	$RSH, R32(%rbp)	C bits shifted out of previous vp limb
   140  	ADDSUB	%r9, %r11	C not on critical path
   141  	mov	(vp), %r9
   142  	lea	(%rbp,%r9,M), %r9	C next shifted limb + carried-in bits
   143  	setc	R8(%rbx)	C save carry out
   144  	mov	4(vp), R32(%rbp)	C high 32 bits of current vp limb
   145  	mov	%r12, (rp)	C store previously completed limb
   146  	ADDSUB	%rax, %r11	C apply previous carry out
   147  	jc	L(c0)		C jump if ripple
   148  L(rc0):
   149  L(L01):	mov	8(up), %r10
   150  	shr	$RSH, R32(%rbp)
   151  	ADDSUB	%r9, %r8
   152  	mov	8(vp), %r9
   153  	lea	(%rbp,%r9,M), %r9
   154  	setc	R8(%rax)	C save carry out
   155  	mov	12(vp), R32(%rbp)
   156  	mov	%r11, 8(rp)	C store completed limb
   157  	ADDSUB	%rbx, %r8	C apply previous carry out
   158  	jc	L(c1)
   159  L(rc1):
   160  L(L00):	mov	16(up), %r12
   161  	shr	$RSH, R32(%rbp)
   162  	ADDSUB	%r9, %r10
   163  	mov	16(vp), %r9
   164  	lea	(%rbp,%r9,M), %r9
   165  	setc	R8(%rbx)	C save carry out
   166  	mov	20(vp), R32(%rbp)
   167  	mov	%r8, 16(rp)	C store completed limb
   168  	ADDSUB	%rax, %r10	C apply previous carry out
   169  	jc	L(c2)
   170  L(rc2):
   171  L(L11):	mov	24(up), %r11
   172  	shr	$RSH, R32(%rbp)
   173  	ADDSUB	%r9, %r12
   174  	mov	24(vp), %r9
   175  	lea	(%rbp,%r9,M), %r9
   176  	lea	32(up), up	C advance pointers one unrolled chunk
   177  	lea	32(vp), vp
   178  	setc	R8(%rax)	C save carry out
   179  	mov	-4(vp), R32(%rbp)	C high 32 bits of vp limb just loaded (vp already advanced)
   180  	mov	%r10, 24(rp)	C store completed limb
   181  	ADDSUB	%rbx, %r12	C apply previous carry out
   182  	jc	L(c3)
   183  L(rc3):	lea	32(rp), rp
   184  L(L10):	sub	$4, n
   185  	ja	L(top)
   186  
   187  L(end):	C wind down: finish the last two pipelined limbs
   188  	shr	$RSH, R32(%rbp)	C bits shifted out of vp[n-1]
   189  	ADDSUB	%r9, %r11
   190  	setc	R8(%rbx)	C save carry out
   191  	mov	%r12, (rp)
   192  	ADDSUB	%rax, %r11	C apply previous carry out
   193  	jnc	L(1)
   194  	mov	$1, R8(%rbx)	C ripple: force carry to 1
   195  L(1):	mov	%r11, 8(rp)
   196  	lea	(%rbx,%rbp), R32(%rax)	C return carry + bits shifted out of top limb
   197  	pop	%rbp	C restore callee-saved registers
   198  	pop	%r12
   199  	pop	%rbx
   200  	FUNC_EXIT()
   201  	ret
   202  L(c3):	mov	$1, R8(%rax)	C ripple fix-up for the L11 stage
   203  	jmp	L(rc3)
EPILOGUE()
   203  ASM_END()