github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/aorrlsh_n.asm (about)

dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V*2^k +- U.

dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.1	< 3.85 for lshift + add_n
C AMD K10	 3.1	< 3.85 for lshift + add_n
C Intel P4	14.6	> 7.33 for lshift + add_n
C Intel core2	 3.87	> 3.27 for lshift + add_n
C Intel NHM	 4	> 3.75 for lshift + add_n
C Intel SBR	(5.8)	> 3.46 for lshift + add_n
C Intel atom	(7.75)	< 8.75 for lshift + add_n
C VIA nano	 4.7	< 6.25 for lshift + add_n

C This was written quickly and not optimized at all.  Surely one could get
C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
C   1) Use indexing to save the 3 LEA instructions
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more; CL negation and carry save/restore cost a lot now
C   5) Reschedule
    51  
    52  C INPUT PARAMETERS
    53  define(`rp',	`%rdi')
    54  define(`up',	`%rsi')
    55  define(`vp',	`%rdx')
    56  define(`n',	`%rcx')
    57  define(`cnt',	`%r8')
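
C The C-level semantics, for orientation (a sketch; the prototypes below are
C an assumption based on how the parameters are used here, not a quote from
C the GMP headers):
C
C   mp_limb_t mpn_addlsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
C                           mp_size_t n, unsigned int cnt);
C   mp_limb_t mpn_rsblsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
C                           mp_size_t n, unsigned int cnt);
C
C mpn_addlsh_n computes {rp,n} = ({vp,n} << cnt) + {up,n}, and mpn_rsblsh_n
C computes {rp,n} = ({vp,n} << cnt) - {up,n}, presumably for 0 < cnt < 64 as
C with mpn_lshift.  The return value is the high limb shifted out of the top
C vp limb with the final carry/borrow folded in (see L(end) below).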

ifdef(`OPERATION_addlsh_n',`
  define(ADCSBB,       `adc')
  define(func, mpn_addlsh_n)
')
ifdef(`OPERATION_rsblsh_n',`
  define(ADCSBB,       `sbb')
  define(func, mpn_rsblsh_n)
')

MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%r12
	push	%r13
	push	%r14
	push	%rbp
	push	%rbx

	mov	n, %rax
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%rbp), R32(%rbp)	C limb carry

	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)
	je	L(4)
	sub	$1, R32(%r11)

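C Handle the n mod 4 leading limbs one at a time so that the main loop below
C can be 4-way unrolled.  Each step shifts one vp limb left by cnt, ORs in
C the bits shifted out of the previous limb (the "limb carry" kept in %rbp),
C adds/subtracts the corresponding up limb, and saves the resulting carry in
C %rbx across iterations.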
L(012):	mov	(vp), %r8
	mov	%r8, %r12
	shl	R8(%rcx), %r8
	or	%rbp, %r8
	neg	R8(%rcx)
	mov	%r12, %rbp
	shr	R8(%rcx), %rbp
	neg	R8(%rcx)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	(up), %r8
	mov	%r8, (rp)
	sbb	R32(%rbx), R32(%rbx)
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(012)

L(4):	sub	$4, %rax
	jc	L(end)

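C Main loop, four limbs per iteration.  Each vp limb is shifted left by cnt
C and combined with the bits shifted out of the preceding limb; the
C complementary right shifts by 64-cnt are obtained by negating the count in
C CL (shift counts are masked mod 64 by the hardware).  The carry of the
C ADCSBB chain is restored from %rbx by the add below and saved again by the
C trailing sbb.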
	ALIGN(16)
L(top):	mov	(vp), %r8
	mov	%r8, %r12
	mov	8(vp), %r9
	mov	%r9, %r13
	mov	16(vp), %r10
	mov	%r10, %r14
	mov	24(vp), %r11

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%rbp, %r8
	mov	%r11, %rbp
	shl	R8(%rcx), %r11

	neg	R8(%rcx)

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %rbp		C used next iteration

	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)

	add	R32(%rbx), R32(%rbx)	C restore carry flag

	ADCSBB	(up), %r8
	ADCSBB	8(up), %r9
	ADCSBB	16(up), %r10
	ADCSBB	24(up), %r11

	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	lea	32(up), up
	lea	32(vp), vp
	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(top)

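C Wind down: fold the final carry/borrow into the bits shifted out of the
C top vp limb and return the result in %rax.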
L(end):	add	R32(%rbx), R32(%rbx)
	ADCSBB	$0, %rbp
	mov	%rbp, %rax
	pop	%rbx
	pop	%rbp
	pop	%r14
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()