github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/aorrlsh1_n.asm (about)

     1  dnl  AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom.
     2  dnl  Used also for AMD bd1.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund.
     5  
     6  dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  C TODO
    37  C  * This code is slightly large at 433 bytes.
    38  C  * sublsh1_n.asm and this file use the same basic pattern.
    39  
    40  C	     cycles/limb
    41  C AMD K8,K9	 ?
    42  C AMD K10	 ?
    43  C AMD bd1	 2.3
    44  C AMD bobcat	 ?
    45  C Intel P4	 ?
    46  C Intel core2	 ?
    47  C Intel NHM	 ?
    48  C Intel SBR	 ?
    49  C Intel atom	 4.875	(4.75 is probably possible)
    50  C VIA nano	 ?
    51  
     52  C INPUT PARAMETERS
         C Symbolic names for the argument registers as the code below uses them.
         C cy is the carry-in taken by the func_nc entry point; the code refers to
         C %r8 directly rather than through this name.
     53  define(`rp',       `%rdi')	C result limbs are stored through rp
     54  define(`up',       `%rsi')	C limbs read as the memory operand of ADCSBB
     55  define(`vp',       `%rdx')	C limbs that get doubled (shifted left by 1)
     56  define(`n',        `%rcx')	C limb count
     57  define(`cy',       `%r8')	C carry-in, func_nc only
     58  
         C Pick the instruction pair and the two entry-point names according to
         C which OPERATION_* symbol is defined.  Presumably the build defines
         C exactly one of them per assembly of this file -- confirm against the
         C build rules; nothing in this file defines either symbol.
     59  ifdef(`OPERATION_addlsh1_n', `
     60    define(ADDSUB,	add)
     61    define(ADCSBB,	adc)
     62    define(func_n,	mpn_addlsh1_n)
     63    define(func_nc,	mpn_addlsh1_nc)')
     64  ifdef(`OPERATION_rsblsh1_n', `
     65    define(ADDSUB,	sub)
     66    define(ADCSBB,	sbb)
     67    define(func_n,	mpn_rsblsh1_n)
     68    define(func_nc,	mpn_rsblsh1_nc)')
     69  
         C ABI_SUPPORT and MULFUNC_PROLOGUE are macros that are not defined in this
         C file (they come in through the include above).  NOTE(review): they look
         C like declarations of the supported ABIs and of the full set of functions
         C this source can provide -- confirm in the macro definitions.
     70  ABI_SUPPORT(DOS64)
     71  ABI_SUPPORT(STD64)
     72  
     73  MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
     74  
     75  ASM_START()
     76  	TEXT
     77  	ALIGN(16)
     78  PROLOGUE(func_n)
         C Entry point without carry-in.  Working limb by limb over n limbs, each
         C vp limb is doubled and then combined with the up limb by ADCSBB:
         C   ADCSBB = adc:  rp[] = 2*vp[] + up[]
         C   ADCSBB = sbb:  rp[] = 2*vp[] - up[]
         C The combined carry is returned in %rax (see the notes at L(rtn)).
         C
         C Two carry chains are interleaved, so each carry is parked in a register
         C as 0 or -1 (sbb reg,reg) and moved back into CF with add reg,reg:
         C   %eax  scy, carry out of the doubling chain (limb added to itself)
         C   %ebp  acy, carry or borrow out of the ADCSBB chain
         C All pointer updates use lea, which does not modify the flags, so they
         C can sit between the flag-producing and flag-consuming instructions.
         C
         C Structure: peel n mod 4 limbs (b1/b2/b3), then one block of 4 limbs if
         C bit 2 of n is set (b0), then an 8-limb loop (top) for the rest.
     79  	FUNC_ENTRY(4)
     80  	push	%rbp			C rbp is used below to hold acy
     81  	xor	R32(%rbp), R32(%rbp)	C acy = 0, no carry-in on this entry
     82  L(ent):	mov	R32(n), R32(%rax)	C shared entry: func_nc jumps here with acy preset
     83  	and	$3, R32(%rax)		C eax = n mod 4
     84  	jz	L(b0)			C nothing to peel; eax = 0 also serves as scy = 0
     85  	cmp	$2, R32(%rax)
     86  	jz	L(b2)			C n mod 4 = 2
     87  	jg	L(b3)			C n mod 4 = 3; fall through when it is 1
     88  
     89  L(b1):	mov	(vp), %r8		C peel one limb
     90  	add	%r8, %r8		C r8 = 2*v0, CF = bit shifted out
     91  	lea	8(vp), vp
     92  	sbb	R32(%rax), R32(%rax)	C save scy
     93  	add	R32(%rbp), R32(%rbp)	C restore acy
     94  	ADCSBB	(up), %r8
     95  	mov	%r8, (rp)
     96  	sbb	R32(%rbp), R32(%rbp)	C save acy
     97  	lea	8(up), up
     98  	lea	8(rp), rp
     99  	jmp	L(b0)
    100  
    101  L(b2):	mov	(vp), %r8		C peel two limbs
    102  	add	%r8, %r8		C start of doubling chain, no carry-in
    103  	mov	8(vp), %r9
    104  	adc	%r9, %r9		C doubling continues through CF
    105  	lea	16(vp), vp
    106  	sbb	R32(%rax), R32(%rax)	C save scy
    107  	add	R32(%rbp), R32(%rbp)	C restore acy
    108  	ADCSBB	(up), %r8
    109  	mov	%r8, (rp)
    110  	ADCSBB	8(up), %r9
    111  	mov	%r9, 8(rp)
    112  	sbb	R32(%rbp), R32(%rbp)	C save acy
    113  	lea	16(up), up
    114  	lea	16(rp), rp
    115  	jmp	L(b0)
    116  
    117  L(b3):	mov	(vp), %r8		C peel three limbs
    118  	add	%r8, %r8		C start of doubling chain, no carry-in
    119  	mov	8(vp), %r9
    120  	adc	%r9, %r9
    121  	mov	16(vp), %r10
    122  	adc	%r10, %r10
    123  	lea	24(vp), vp
    124  	sbb	R32(%rax), R32(%rax)	C save scy
    125  	add	R32(%rbp), R32(%rbp)	C restore acy
    126  	ADCSBB	(up), %r8
    127  	mov	%r8, (rp)
    128  	ADCSBB	8(up), %r9
    129  	mov	%r9, 8(rp)
    130  	ADCSBB	16(up), %r10
    131  	mov	%r10, 16(rp)
    132  	sbb	R32(%rbp), R32(%rbp)	C save acy
    133  	lea	24(up), up
    134  	lea	24(rp), rp
    135  
         C From here on the doubling chain starts with adc, picking up scy from the
         C limbs already handled.  n itself has not been modified so far.
    136  L(b0):	test	$4, R8(n)		C bit 2 of n set: one block of 4 limbs
    137  	jz	L(skp)
    138  	add	R32(%rax), R32(%rax)	C restore scy
    139  	mov	(vp), %r8
    140  	adc	%r8, %r8
    141  	mov	8(vp), %r9
    142  	adc	%r9, %r9
    143  	mov	16(vp), %r10
    144  	adc	%r10, %r10
    145  	mov	24(vp), %r11
    146  	adc	%r11, %r11
    147  	lea	32(vp), vp
    148  	sbb	R32(%rax), R32(%rax)	C save scy
    149  	add	R32(%rbp), R32(%rbp)	C restore acy
    150  	ADCSBB	(up), %r8
    151  	mov	%r8, (rp)
    152  	ADCSBB	8(up), %r9
    153  	mov	%r9, 8(rp)
    154  	ADCSBB	16(up), %r10
    155  	mov	%r10, 16(rp)
    156  	ADCSBB	24(up), %r11
    157  	mov	%r11, 24(rp)
    158  	lea	32(up), up
    159  	lea	32(rp), rp
    160  	sbb	R32(%rbp), R32(%rbp)	C save acy
    161  
    162  L(skp):	cmp	$8, n			C n still holds the full count here
    163  	jl	L(rtn)			C fewer than 8 limbs in total: no loop needed
    164  
         C The loop needs four more limb temporaries; these registers are saved
         C here and restored at L(end).
    165  	push	%r12
    166  	push	%r13
    167  	push	%r14
    168  	push	%rbx
    169  	lea	-64(rp), rp		C pre-bias rp; the loop top adds 64 back
    170  	jmp	L(x)			C enter at the loop test
    171  
    172  	ALIGN(16)
         C Main loop, 8 limbs per iteration: double 8 vp limbs into registers, park
         C scy, then run the ADCSBB chain against up and store to rp, park acy.
    173  L(top):	add	R32(%rax), R32(%rax)	C restore scy
    174  	lea	64(rp), rp
    175  	mov	(vp), %r8
    176  	adc	%r8, %r8
    177  	mov	8(vp), %r9
    178  	adc	%r9, %r9
    179  	mov	16(vp), %r10
    180  	adc	%r10, %r10
    181  	mov	24(vp), %r11
    182  	adc	%r11, %r11
    183  	mov	32(vp), %r12
    184  	adc	%r12, %r12
    185  	mov	40(vp), %r13
    186  	adc	%r13, %r13
    187  	mov	48(vp), %r14
    188  	adc	%r14, %r14
    189  	mov	56(vp), %rbx
    190  	adc	%rbx, %rbx
    191  	lea	64(vp), vp
    192  	sbb	R32(%rax), R32(%rax)	C save scy
    193  	add	R32(%rbp), R32(%rbp)	C restore acy
    194  	ADCSBB	(up), %r8
    195  	mov	%r8, (rp)
    196  	ADCSBB	8(up), %r9
    197  	mov	%r9, 8(rp)
    198  	ADCSBB	16(up), %r10
    199  	mov	%r10, 16(rp)
    200  	ADCSBB	24(up), %r11
    201  	mov	%r11, 24(rp)
    202  	ADCSBB	32(up), %r12
    203  	mov	%r12, 32(rp)
    204  	ADCSBB	40(up), %r13
    205  	mov	%r13, 40(rp)
    206  	ADCSBB	48(up), %r14
    207  	mov	%r14, 48(rp)
    208  	ADCSBB	56(up), %rbx
    209  	mov	%rbx, 56(rp)
    210  	sbb	R32(%rbp), R32(%rbp)	C save acy
    211  	lea	64(up), up
    212  L(x):	sub	$8, n			C n -= 8; the low 3 bits of n were handled above
    213  	jge	L(top)			C another full block of 8 limbs remains
    214  
    215  L(end):	pop	%rbx			C restore in reverse order of the pushes
    216  	pop	%r14
    217  	pop	%r13
    218  	pop	%r12
    219  L(rtn):
         C Return value.  At this point eax = -scy and ebp = -acy (each 0 or -1).
         C   add variant:  eax = -(eax + ebp) = scy + acy
         C   rsb variant:  rax = sign-extended (ebp - eax) = scy - acy
    220  ifdef(`OPERATION_addlsh1_n',`
    221  	add	R32(%rbp), R32(%rax)
    222  	neg	R32(%rax)')
    223  ifdef(`OPERATION_rsblsh1_n',`
    224  	sub	R32(%rax), R32(%rbp)
    225  	movslq	R32(%rbp), %rax')
    226  
    227  	pop	%rbp			C pairs with the push at entry (func_n or func_nc)
    228  	FUNC_EXIT()
    229  	ret
    230  EPILOGUE()
    231  PROLOGUE(func_nc)
         C Entry point with a carry-in in %r8 (named cy in the defines).  Any
         C nonzero value counts as a carry of 1: it is turned into acy (0 or -1)
         C in %ebp, and execution then joins func_n at L(ent), which also performs
         C the matching pop of %rbp and the return.
    232  	FUNC_ENTRY(4)
         C Under IFDOS the carry-in is loaded from the stack instead.  Presumably
         C this is the fifth argument slot of the DOS64 convention as seen after
         C FUNC_ENTRY -- TODO confirm the 56 byte offset against the definition of
         C FUNC_ENTRY, which is not part of this file.
    233  IFDOS(`	mov	56(%rsp), %r8	')
    234  	push	%rbp			C same stack layout as func_n has at L(ent)
    235  	neg	%r8			C set CF = (cy != 0)
    236  	sbb	R32(%rbp), R32(%rbp)	C save acy: ebp = -CF
    237  	jmp	L(ent)
    238  EPILOGUE()