github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/aorrlsh1_n.asm (about)

     1  dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
     2  dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
     3  
     4  dnl  Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 2
    37  C AMD K10	 2
    38  C AMD bd1	 ?
    39  C AMD bobcat	 ?
    40  C Intel P4	 13
    41  C Intel core2	 3.45
    42  C Intel NHM	 ?
    43  C Intel SBR	 ?
    44  C Intel atom	 ?
    45  C VIA nano	 ?
    46  
    47  
    48  C Sometimes speed degenerates, supposedly because some operand alignments
    49  C cause cache conflicts.
    50  
    51  C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
    52  C in the loop, which corresponds to ceil(22/3)/4 = 2 c/l.
    53  
    54  C INPUT PARAMETERS
    55  define(`rp',`%rdi')	C destination limb vector
    56  define(`up',`%rsi')	C operand that is added or subtracted unshifted
    57  define(`vp',`%rdx')	C operand that is shifted left by one bit
    58  define(`n', `%rcx')	C limb count
    59  C The ifdefs below select the add or the reverse-subtract flavour: they set the first-limb op, the carry-chained op and the entry name.
    60  ifdef(`OPERATION_addlsh1_n', `
    61    define(ADDSUB,	add)
    62    define(ADCSBB,	adc)
    63    define(func,		mpn_addlsh1_n)')
    64  ifdef(`OPERATION_rsblsh1_n', `
    65    define(ADDSUB,	sub)
    66    define(ADCSBB,	sbb)
    67    define(func,		mpn_rsblsh1_n)')
    68  
    69  MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
    70  
    71  ABI_SUPPORT(DOS64)
    72  ABI_SUPPORT(STD64)
    73  C func computes rp[] = up[] + 2*vp[] or rp[] = 2*vp[] - up[] over n limbs and returns the carry-out in rax.
    74  ASM_START()
    75  	TEXT
    76  	ALIGN(16)
    77  PROLOGUE(func)
    78  	FUNC_ENTRY(4)
    79  	push	%rbp			C rbp holds the saved add/sub carry below; restored before ret
    80  C Register use: eax = saved shift carry scy, ebp = saved add/sub carry acy, each kept as 0 or -1; r8-r11 = limbs in flight.
    81  	mov	(vp), %r8		C r8 = vp[0]; read unconditionally, so n >= 1 is assumed
    82  	mov	R32(n), R32(%rax)	C eax = low half of n, reduced to n mod 4 below
    83  	lea	(rp,n,8), rp		C advance rp, up and vp to just past their last limb
    84  	lea	(up,n,8), up
    85  	lea	(vp,n,8), vp
    86  	neg	n			C n = -n, so (ptr,n,8) addresses limb 0 and n counts up towards 0
    87  	xor	R32(%rbp), R32(%rbp)	C acy = 0
    88  	and	$3, R32(%rax)		C eax = n mod 4; and also leaves CF clear for the b00 entry
    89  	je	L(b00)			C n mod 4 = 0: join the 4-limb loop body with r8 already loaded
    90  	cmp	$2, R32(%rax)
    91  	jc	L(b01)			C n mod 4 = 1
    92  	je	L(b10)			C n mod 4 = 2
    93  
    94  L(b11):	add	%r8, %r8		C n mod 4 = 3: double three limbs of vp, chaining the shifted-out bit through CF
    95  	mov	8(vp,n,8), %r9
    96  	adc	%r9, %r9
    97  	mov	16(vp,n,8), %r10
    98  	adc	%r10, %r10
    99  	sbb	R32(%rax), R32(%rax)	C save scy: eax = 0 or -1
   100  	ADDSUB	(up,n,8), %r8		C combine with up; the mov stores in between leave the flags alone
   101  	ADCSBB	8(up,n,8), %r9
   102  	mov	%r8, (rp,n,8)
   103  	mov	%r9, 8(rp,n,8)
   104  	ADCSBB	16(up,n,8), %r10
   105  	mov	%r10, 16(rp,n,8)
   106  	sbb	R32(%rbp), R32(%rbp)	C save acy: ebp = 0 or -1
   107  	add	$3, n			C three limbs done; the sign of n is tested at ent
   108  	jmp	L(ent)
   109  
   110  L(b10):	add	%r8, %r8		C n mod 4 = 2: same scheme with two limbs
   111  	mov	8(vp,n,8), %r9
   112  	adc	%r9, %r9
   113  	sbb	R32(%rax), R32(%rax)	C save scy
   114  	ADDSUB	(up,n,8), %r8
   115  	ADCSBB	8(up,n,8), %r9
   116  	mov	%r8, (rp,n,8)
   117  	mov	%r9, 8(rp,n,8)
   118  	sbb	R32(%rbp), R32(%rbp)	C save acy
   119  	add	$2, n			C two limbs done; the sign of n is tested at ent
   120  	jmp	L(ent)
   121  
   122  L(b01):	add	%r8, %r8		C n mod 4 = 1: single limb
   123  	sbb	R32(%rax), R32(%rax)	C save scy
   124  	ADDSUB	(up,n,8), %r8
   125  	mov	%r8, (rp,n,8)
   126  	sbb	R32(%rbp), R32(%rbp)	C save acy
   127  	inc	n			C one limb done; falls through to the sign test
   128  L(ent):	jns	L(end)			C flags come from the add or inc on n: n >= 0 means no limbs remain
   129  
   130  	ALIGN(16)
   131  L(top):	add	R32(%rax), R32(%rax)	C restore scy: CF = 1 iff eax was -1
   132  
   133  	mov	(vp,n,8), %r8
   134  L(b00):	adc	%r8, %r8		C double four limbs of vp with the shift carry chained through CF
   135  	mov	8(vp,n,8), %r9
   136  	adc	%r9, %r9
   137  	mov	16(vp,n,8), %r10
   138  	adc	%r10, %r10
   139  	mov	24(vp,n,8), %r11
   140  	adc	%r11, %r11
   141  
   142  	sbb	R32(%rax), R32(%rax)	C save scy
   143  	add	R32(%rbp), R32(%rbp)	C restore acy: CF = 1 iff ebp was -1
   144  
   145  	ADCSBB	(up,n,8), %r8		C add or subtract four limbs of up with acy chained through CF
   146  	nop				C Hammer speedup!
   147  	ADCSBB	8(up,n,8), %r9
   148  	mov	%r8, (rp,n,8)
   149  	mov	%r9, 8(rp,n,8)
   150  	ADCSBB	16(up,n,8), %r10
   151  	ADCSBB	24(up,n,8), %r11
   152  	mov	%r10, 16(rp,n,8)
   153  	mov	%r11, 24(rp,n,8)
   154  
   155  	sbb	R32(%rbp), R32(%rbp)	C save acy
   156  	add	$4, n			C four limbs done
   157  	js	L(top)			C loop while n is still negative
   158  
   159  L(end):				C eax = -scy and ebp = -acy here; addlsh1 returns scy + acy, rsblsh1 returns scy - acy sign-extended to 64 bits
   160  ifdef(`OPERATION_addlsh1_n',`
   161  	add	R32(%rbp), R32(%rax)
   162  	neg	R32(%rax)')
   163  ifdef(`OPERATION_rsblsh1_n',`
   164  	sub	R32(%rax), R32(%rbp)
   165  	movslq	R32(%rbp), %rax')
   166  
   167  	pop	%rbp			C restore the caller value saved at entry
   168  	FUNC_EXIT()
   169  	ret
   170  EPILOGUE()