github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/sublsh1_n.asm (about)

     1  dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
     2  
     3  dnl  Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C	     cycles/limb
    35  C AMD K8,K9	 2.2
    36  C AMD K10	 2.2
    37  C Intel P4	12.75
    38  C Intel core2	 3.45
    39  C Intel corei	 ?
    40  C Intel atom	 ?
    41  C VIA nano	 3.25
    42  
    43  C Sometimes speed degenerates, supposedly because some operand alignments
    44  C cause cache conflicts.
    45  
    46  C The speed is limited by decoding/issue bandwidth.  The loop has 26 instructions
    47  C for 4 limbs; at 3 instructions per cycle that is 26/3/4 = 2.167 c/l.
    48  
    49  C INPUT PARAMETERS
        C The four names below alias the first four integer argument registers of
        C the System V AMD64 calling convention, in argument order.  The roles
        C given here are the ones the code of mpn_sublsh1_n actually uses.
    50  define(`rp',`%rdi')	C destination: difference limbs are stored here
    51  define(`up',`%rsi')	C source limbs that are subtracted from
    52  define(`vp',`%rdx')	C source limbs that are doubled, then subtracted
    53  define(`n', `%rcx')	C limb count; negated and reused as the running index
    54  
        C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined
        C outside this file.  Judging by their names they declare the ABIs this
        C file can be built for, open the code section and align the entry point
        C to 16 bytes -- confirm against the m4 definitions.
    55  ABI_SUPPORT(DOS64)
    56  ABI_SUPPORT(STD64)
    57  
    58  ASM_START()
    59  	TEXT
    60  	ALIGN(16)
    61  PROLOGUE(mpn_sublsh1_n)
        C mpn_sublsh1_n: rp[] = up[] - (vp[] << 1), computed limb by limb from
        C index 0 upwards over n limbs.  Limbs are 64 bits wide (64-bit registers,
        C all indexing scaled by 8).
        C
        C Return value in %rax: scy + acy, where scy is the bit shifted out of the
        C last vp limb and acy is the final borrow of the subtraction, so the
        C result is 0, 1 or 2.
        C
        C Register roles:
        C   %r8-%r11    vp limbs, doubled in place
        C   %rbp, %rbx  up limbs, then the differences that are stored to rp
        C   %rax        scy parked between blocks as 0 or -1
        C   %rbp        acy parked between blocks as 0 or -1
        C   n           negated limb count, used as an index that counts up to 0;
        C               rp, up and vp are moved to point just past their operands
        C
        C Two carry chains (the shift carry and the subtract borrow) have to share
        C the one carry flag.  Each is parked in a register with sbb r,r (r = -CF)
        C and brought back with add r,r (CF = 1 exactly when r was -1).  mov and
        C lea leave the flags alone, which is why loads and stores can sit inside
        C the adc/sbb chains.
        C
        C Entry selection by n mod 4: b01, b10 and b11 handle 1, 2 or 3 leading
        C limbs and then fall into the 4-limb loop if anything is left; n mod 4 == 0
        C jumps straight into the loop at b00.
        C
        C NOTE(review): n = 0 also takes the b00 path and would process four
        C limbs; presumably callers guarantee n >= 1 -- confirm.
    62  	FUNC_ENTRY(4)	C macro defined outside this file; presumably per-ABI setup for 4 arguments -- confirm
    63  	push	%rbx	C %rbx and %rbp are used as scratch below,
    64  	push	%rbp	C saved here and restored before ret
    65  
    66  	mov	(vp), %r8	C preload vp[0] before vp is moved
    67  	mov	R32(n), R32(%rax)	C copy of n for the n mod 4 dispatch
    68  	lea	(rp,n,8), rp	C point rp, up and vp just past
    69  	lea	(up,n,8), up	C their last limb ...
    70  	lea	(vp,n,8), vp
    71  	neg	n	C ... so that (ptr,n,8) addresses limb 0
    72  	xor	R32(%rbp), R32(%rbp)	C acy = 0, needed by the b00 entry
    73  	and	$3, R32(%rax)	C n mod 4; and also leaves CF = 0 for b00
    74  	je	L(b00)	C n mod 4 == 0: enter the loop directly
    75  	cmp	$2, R32(%rax)
    76  	jc	L(b01)	C n mod 4 == 1
    77  	je	L(b10)	C n mod 4 == 2
    78  
        C n mod 4 == 3: handle three leading limbs.
    79  L(b11):	add	%r8, %r8	C double vp[0]; CF = bit shifted out
    80  	mov	8(vp,n,8), %r9
    81  	adc	%r9, %r9	C double vp[1], shifting in the bit from below
    82  	mov	16(vp,n,8), %r10
    83  	adc	%r10, %r10
    84  	sbb	R32(%rax), R32(%rax)	C save scy: %eax = -CF
    85  	mov	(up,n,8), %rbp
    86  	mov	8(up,n,8), %rbx
    87  	sub	%r8, %rbp	C first limb: plain sub, no incoming borrow
    88  	sbb	%r9, %rbx
    89  	mov	%rbp, (rp,n,8)
    90  	mov	%rbx, 8(rp,n,8)
    91  	mov	16(up,n,8), %rbp
    92  	sbb	%r10, %rbp
    93  	mov	%rbp, 16(rp,n,8)
    94  	sbb	R32(%rbp), R32(%rbp)	C save acy: %ebp = -CF
    95  	add	$3, n	C step past the three limbs; sets SF for the test at ent
    96  	jmp	L(ent)
    97  
        C n mod 4 == 2: handle two leading limbs.
    98  L(b10):	add	%r8, %r8	C double vp[0]; CF = bit shifted out
    99  	mov	8(vp,n,8), %r9
   100  	adc	%r9, %r9
   101  	sbb	R32(%rax), R32(%rax)	C save scy: %eax = -CF
   102  	mov	(up,n,8), %rbp
   103  	mov	8(up,n,8), %rbx
   104  	sub	%r8, %rbp	C first limb: plain sub, no incoming borrow
   105  	sbb	%r9, %rbx
   106  	mov	%rbp, (rp,n,8)
   107  	mov	%rbx, 8(rp,n,8)
   108  	sbb	R32(%rbp), R32(%rbp)	C save acy: %ebp = -CF
   109  	add	$2, n	C step past the two limbs; sets SF for the test at ent
   110  	jmp	L(ent)
   111  
        C n mod 4 == 1: handle one leading limb.
   112  L(b01):	add	%r8, %r8	C double vp[0]; CF = bit shifted out
   113  	sbb	R32(%rax), R32(%rax)	C save scy: %eax = -CF
   114  	mov	(up,n,8), %rbp
   115  	sub	%r8, %rbp
   116  	mov	%rbp, (rp,n,8)
   117  	sbb	R32(%rbp), R32(%rbp)	C save acy: %ebp = -CF
   118  	inc	n	C step past the limb; sets SF for the test below
   119  L(ent):	jns	L(end)	C index reached 0: no group of four limbs is left
   120  
        C Main loop: four limbs per iteration.
   121  	ALIGN(16)
   122  L(top):	add	R32(%rax), R32(%rax)	C restore scy: CF = 1 exactly when %eax was -1
   123  
   124  	mov	(vp,n,8), %r8
        C Entry for n mod 4 == 0: %r8 already holds vp[0], CF = 0 from the and,
        C and %rbp = 0, so both chains start without an incoming carry.
   125  L(b00):	adc	%r8, %r8	C double four vp limbs, carry running through
   126  	mov	8(vp,n,8), %r9
   127  	adc	%r9, %r9
   128  	mov	16(vp,n,8), %r10
   129  	adc	%r10, %r10
   130  	mov	24(vp,n,8), %r11
   131  	adc	%r11, %r11
   132  
   133  	sbb	R32(%rax), R32(%rax)	C save scy: %eax = -CF
   134  	add	R32(%rbp), R32(%rbp)	C restore acy: CF = 1 exactly when %ebp was -1; %rbp is reloaded next
   135  
   136  	mov	(up,n,8), %rbp
   137  	mov	8(up,n,8), %rbx
   138  	sbb	%r8, %rbp	C subtract the doubled limbs, borrow running through
   139  	sbb	%r9, %rbx
   140  	mov	%rbp, (rp,n,8)
   141  	mov	%rbx, 8(rp,n,8)
   142  	mov	16(up,n,8), %rbp
   143  	mov	24(up,n,8), %rbx
   144  	sbb	%r10, %rbp
   145  	sbb	%r11, %rbx
   146  	mov	%rbp, 16(rp,n,8)
   147  	mov	%rbx, 24(rp,n,8)
   148  
   149  	sbb	R32(%rbp), R32(%rbp)	C save acy: %ebp = -CF
   150  	add	$4, n	C advance to the next group of four limbs
   151  	js	L(top)	C loop while the index is still negative
   152  
        C Here %eax = -scy and %ebp = -acy.
   153  L(end):	add	R32(%rbp), R32(%rax)	C %eax = -(scy + acy)
   154  	neg	R32(%rax)	C return scy + acy
   155  
   156  	pop	%rbp
   157  	pop	%rbx
   158  	FUNC_EXIT()	C macro defined outside this file; presumably undoes FUNC_ENTRY -- confirm
   159  	ret
   160  EPILOGUE()