github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/aorrlsh_n.asm (about)

     1  dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.
     2  
     3  dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C AMD K8,K9	 2.87	< 3.85 for lshift + add_n
    35  C AMD K10	 2.75	< 3.85 for lshift + add_n
    36  C Intel P4	22	> 7.33 for lshift + add_n
    37  C Intel core2	 4.1	> 3.27 for lshift + add_n
    38  C Intel NHM	 4.4	> 3.75 for lshift + add_n
    39  C Intel SBR	 3.17	< 3.46 for lshift + add_n
    40  C Intel atom	 ?	? 8.75 for lshift + add_n
    41  C VIA nano	 4.7	< 6.25 for lshift + add_n
    42  
    43  C TODO
    44  C  * Can we propagate carry into rdx instead of using a special carry register?
    45  C    That could save enough insns to get to 10 cycles/iteration.
    46  
C Incoming argument registers (SysV AMD64; FUNC_ENTRY/IFDOS below remap
C the DOS64 calling convention onto these same names):
C   rp       = result limb pointer
C   up       = pointer to the operand that is added/subtracted un-shifted
C   vp_param = pointer to the operand to be left-shifted
C   n_param  = limb count
C   cnt      = shift amount (presumably 1..63, the usual mpn lsh contract
C              -- confirm against gmp-impl.h)
    47  define(`rp',       `%rdi')
    48  define(`up',       `%rsi')
    49  define(`vp_param', `%rdx')
    50  define(`n_param',  `%rcx')
    51  define(`cnt',      `%r8')
    52  
C After the prologue, vp is rehoused in callee-saved r12 (rdx is needed
C as the mul high-half output) and the negated loop counter lives in rbp.
    53  define(`vp',    `%r12')
    54  define(`n',     `%rbp')
    55  
C One source file yields both entry points: m4 picks the limb operation
C (add/adc for addlsh, sub/sbb for rsblsh) and the exported symbol name.
    56  ifdef(`OPERATION_addlsh_n',`
    57    define(ADDSUB,       `add')
    58    define(ADCSBB,       `adc')
    59    define(func, mpn_addlsh_n)
    60  ')
    61  ifdef(`OPERATION_rsblsh_n',`
    62    define(ADDSUB,       `sub')
    63    define(ADCSBB,       `sbb')
    64    define(func, mpn_rsblsh_n)
    65  ')
    66  
    67  MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
    68  
C Both the standard SysV (STD64) and Windows (DOS64) ABIs are supported.
    69  ABI_SUPPORT(DOS64)
    70  ABI_SUPPORT(STD64)
C func(rp, up, vp, n, cnt): rp[0..n-1] = (vp[0..n-1] << cnt) +- up[0..n-1]
C (add for mpn_addlsh_n, reverse-subtract for mpn_rsblsh_n); the high limb
C of the result (top of the shift, adjusted by the final carry/borrow) is
C returned in rax.
C
C The left shift is synthesised with a widening mul by 2^cnt: each
C product's low half is OR-ed with the previous limb's high half, so the
C mul stream and the ADDSUB/ADCSBB chain can overlap.  Because mul
C clobbers the flags, the add/sub carry is parked in ecx between loop
C iterations: `sbb %ecx,%ecx` records CF as 0/-1 and `add %ecx,%ecx`
C regenerates it.
    72  ASM_START()
    73  	TEXT
    74  	ALIGN(16)
    75  PROLOGUE(func)
    76  	FUNC_ENTRY(4)
C DOS64 passes the 5th argument (cnt) on the stack; fetch it into r8d.
    77  IFDOS(`	mov	56(%rsp), %r8d	')
    78  	push	%r12
    79  	push	%rbp
    80  	push	%rbx
    81  
    82  	mov	(vp_param), %rax	C load first V limb early
    83  
C n = -n_param: the loop counter runs upward toward zero.
    84  	mov	$0, R32(n)
    85  	sub	n_param, n
    86  
C Bias the pointers so that the (reg,n,8) addressing modes below index
C the correct limbs with the negative, upward-counting n.
    87  	lea	-16(up,n_param,8), up
    88  	lea	-16(rp,n_param,8), rp
    89  	lea	16(vp_param,n_param,8), vp
    90  
    91  	mov	n_param, %r9
    92  
C r8 = 2^cnt, the multiplier implementing the shift; cnt itself is dead
C after this (shl only reads cl).
    93  	mov	%r8, %rcx
    94  	mov	$1, R32(%r8)
    95  	shl	R8(%rcx), %r8
    96  
    97  	mul	%r8			C initial multiply
    98  
C Dispatch on n mod 4 into the matching feed-in of the 4-way unrolled
C loop: 0 -> b0, 1 -> b1, 2 -> b2, 3 -> b3.
    99  	and	$3, R32(%r9)
   100  	jz	L(b0)
   101  	cmp	$2, R32(%r9)
   102  	jc	L(b1)
   103  	jz	L(b2)
   104  
C Feed-in for n == 3 (mod 4): do one ADDSUB, save CF in ecx, prime the
C r11/rbx/r9 pipeline, then join the loop at lo3 (or finish at cj3).
   105  L(b3):	mov	%rax, %r11
   106  	ADDSUB	16(up,n,8), %r11
   107  	mov	-8(vp,n,8), %rax
   108  	sbb	R32(%rcx), R32(%rcx)
   109  	mov	%rdx, %rbx
   110  	mul	%r8
   111  	or	%rax, %rbx
   112  	mov	(vp,n,8), %rax
   113  	mov	%rdx, %r9
   114  	mul	%r8
   115  	or	%rax, %r9
   116  	add	$3, n
   117  	jnz	L(lo3)
   118  	jmp	L(cj3)
   119  
C Feed-in for n == 2 (mod 4); no ADDSUB has happened yet, so the saved
C carry is explicitly cleared before entering the loop at lo2.
   120  L(b2):	mov	%rax, %rbx
   121  	mov	-8(vp,n,8), %rax
   122  	mov	%rdx, %r9
   123  	mul	%r8
   124  	or	%rax, %r9
   125  	add	$2, n
   126  	jz	L(cj2)
   127  	mov	%rdx, %r10
   128  	mov	-16(vp,n,8), %rax
   129  	mul	%r8
   130  	or	%rax, %r10
   131  	xor	R32(%rcx), R32(%rcx)	C clear carry register
   132  	jmp	L(lo2)
   133  
C Feed-in for n == 1 (mod 4).  n == 1 exactly is handled inline (gt1 not
C taken); otherwise three limbs are combined and CF parked before lo1.
   134  L(b1):	mov	%rax, %r9
   135  	mov	%rdx, %r10
   136  	add	$1, n
   137  	jnz	L(gt1)
   138  	ADDSUB	8(up,n,8), %r9
   139  	jmp	L(cj1)
   140  L(gt1):	mov	-16(vp,n,8), %rax
   141  	mul	%r8
   142  	or	%rax, %r10
   143  	mov	%rdx, %r11
   144  	mov	-8(vp,n,8), %rax
   145  	mul	%r8
   146  	or	%rax, %r11
   147  	ADDSUB	8(up,n,8), %r9
   148  	ADCSBB	16(up,n,8), %r10
   149  	ADCSBB	24(up,n,8), %r11
   150  	mov	(vp,n,8), %rax
   151  	sbb	R32(%rcx), R32(%rcx)
   152  	jmp	L(lo1)
   153  
C Feed-in for n == 0 (mod 4): two limbs combined, CF parked, then either
C straight to the wind-down (n == 4 exactly) or into the main loop.
   154  L(b0):	mov	%rax, %r10
   155  	mov	%rdx, %r11
   156  	mov	-8(vp,n,8), %rax
   157  	mul	%r8
   158  	or	%rax, %r11
   159  	ADDSUB	16(up,n,8), %r10
   160  	ADCSBB	24(up,n,8), %r11
   161  	mov	(vp,n,8), %rax
   162  	sbb	R32(%rcx), R32(%rcx)
   163  	mov	%rdx, %rbx
   164  	mul	%r8
   165  	or	%rax, %rbx
   166  	mov	8(vp,n,8), %rax
   167  	add	$4, n
   168  	jz	L(end)
   169  
C Main loop, four limbs per iteration.  rbx/r9/r10/r11 hold consecutive
C shifted V limbs; `add %ecx,%ecx` re-creates the saved carry, the four
C ADCSBBs fold in the U limbs, and `sbb %ecx,%ecx` re-parks CF before
C the next iteration's muls destroy the flags.
   170  	ALIGN(8)
   171  L(top):	mov	%rdx, %r9
   172  	mul	%r8
   173  	or	%rax, %r9
   174  	mov	%r10, -16(rp,n,8)
   175  L(lo3):	mov	%rdx, %r10
   176  	mov	-16(vp,n,8), %rax
   177  	mul	%r8
   178  	or	%rax, %r10
   179  	mov	%r11, -8(rp,n,8)
   180  L(lo2):	mov	%rdx, %r11
   181  	mov	-8(vp,n,8), %rax
   182  	mul	%r8
   183  	or	%rax, %r11
   184  	add	R32(%rcx), R32(%rcx)
   185  	ADCSBB	(up,n,8), %rbx
   186  	ADCSBB	8(up,n,8), %r9
   187  	ADCSBB	16(up,n,8), %r10
   188  	ADCSBB	24(up,n,8), %r11
   189  	mov	(vp,n,8), %rax
   190  	sbb	R32(%rcx), R32(%rcx)
   191  	mov	%rbx, (rp,n,8)
   192  L(lo1):	mov	%rdx, %rbx
   193  	mul	%r8
   194  	or	%rax, %rbx
   195  	mov	%r9, 8(rp,n,8)
   196  L(lo0):	mov	8(vp,n,8), %rax
   197  	add	$4, n
   198  	jnz	L(top)
   199  
C Wind-down: store the remaining limbs, apply the last carry, and return
C the top limb (last product's high half, ADCSBB-adjusted) in rax.
   200  L(end):	mov	%rdx, %r9
   201  	mul	%r8
   202  	or	%rax, %r9
   203  	mov	%r10, -16(rp,n,8)
   204  L(cj3):	mov	%r11, -8(rp,n,8)
   205  L(cj2):	add	R32(%rcx), R32(%rcx)
   206  	ADCSBB	(up,n,8), %rbx
   207  	ADCSBB	8(up,n,8), %r9
   208  	mov	%rbx, (rp,n,8)
   209  L(cj1):	mov	%r9, 8(rp,n,8)
   210  	mov	%rdx, %rax
   211  	ADCSBB	$0, %rax
   212  	pop	%rbx
   213  	pop	%rbp
   214  	pop	%r12
   215  	FUNC_EXIT()
   216  	ret
   217  EPILOGUE()