github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/lshsub_n.asm (about)

     1  dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
     2  
     3  dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C	     cycles/limb
    35  C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
    36  C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
    37  C Intel P4	16.5
    38  C Intel core2	 4.35
    39  C Intel corei	 ?
    40  C Intel atom	 ?
    41  C VIA nano	 ?
    42  
    43  C This was written quickly and not optimized at all, but it runs very well on
    44  C K8.  But perhaps one could get under 3 c/l.  Ideas:
    45  C   1) Use indexing to save the 3 LEA
    46  C   2) Write reasonable feed-in code
    47  C   3) Be more clever about register usage
    48  C   4) Unroll more, since CL negation and carry save/restore cost much now
    49  C   5) Reschedule
    50  
    51  C INPUT PARAMETERS
        C   rp   destination: one 64-bit result limb is stored per source limb
        C   up   source U: limbs loaded here are the minuend
        C   vp   source V: limbs loaded here are subtracted (with borrow) from U
        C   n    number of limbs to process; copied to %rax because %rcx is then
        C        reused to hold the shift count
        C   cnt  left shift count; read through R32 (%r8) and used from the byte
        C        register of %rcx
    52  define(`rp',	`%rdi')
    53  define(`up',	`%rsi')
    54  define(`vp',	`%rdx')
    55  define(`n',	`%rcx')
    56  define(`cnt',	`%r8')
    57  
    58  ABI_SUPPORT(DOS64)
    59  ABI_SUPPORT(STD64)
    60  
        C mpn_lshsub_n (rp, up, vp, n, cnt)
        C
        C Walking from the lowest address upwards, each U limb has the matching V
        C limb subtracted from it with borrow propagation.  The difference is
        C shifted left by cnt bits, the bits shifted out of the previous difference
        C are ORed into the low end, and the result is stored through rp.
        C
        C The first n mod 4 limbs are handled one at a time in L(oopette), the
        C remaining limbs four at a time in L(oop).
        C
        C Register roles after the setup code:
        C   %rax  copy of n, counted down by 4 to control L(oop)
        C   %rbx  saved borrow between flag-clobbering instructions: the low 32
        C         bits are 0 or all ones, as produced by sbb of a register with
        C         itself
        C   %cl   shift count; negated temporarily to obtain the complementary
        C         right shift, then negated back
        C   %r15  bits shifted out of the top of the last limb processed, to be
        C         merged into the next limb ("limb carry")
        C   %r11  iteration counter in L(oopette), a data limb in L(oop)
        C
        C Return value in %rax: (borrow << cnt) + %r15, computed at L(end).
        C
        C NOTE(review): the negated-cl right shift equals a shift by 64-cnt only
        C for cnt in 1..63; presumably callers guarantee that range -- confirm.
    61  ASM_START()
    62  	TEXT
    63  	ALIGN(16)
    64  PROLOGUE(mpn_lshsub_n)
    65  	FUNC_ENTRY(4)
        C The next line fetches cnt from the stack slot 56(%rsp) into %r8d.
        C NOTE(review): IFDOS presumably emits it only for the DOS64 ABI, where the
        C fifth argument is not passed in a register -- confirm against the macro
        C definition.
    66  IFDOS(`	mov	56(%rsp), %r8d	')
    67  
        	C Save the callee-saved registers that are used as scratch below.
    68  	push	%r12
    69  	push	%r13
    70  	push	%r14
    71  	push	%r15
    72  	push	%rbx
    73  
    74  	mov	n, %rax			C rax = n, frees rcx for the shift count
    75  	xor	R32(%rbx), R32(%rbx)	C clear carry save register
    76  	mov	R32(%r8), R32(%rcx)	C shift count
    77  	xor	R32(%r15), R32(%r15)	C limb carry
    78  
    79  	mov	R32(%rax), R32(%r11)
    80  	and	$3, R32(%r11)		C r11 = n mod 4 = single-limb iterations
    81  	je	L(4)			C none needed, go to the 4-way loop
    82  	sub	$1, R32(%r11)		C bias so the jnc below exits after r11 passes
    83  
        	C One limb per iteration.  The sub at the bottom clobbers the carry
        	C flag, which is why the borrow is parked in rbx across iterations.
    84  L(oopette):
    85  	add	R32(%rbx), R32(%rbx)	C restore carry flag
    86  	mov	0(up), %r8		C r8 = U limb (mov and lea leave flags alone)
    87  	lea	8(up), up
    88  	sbb	0(vp), %r8		C r8 = U limb - V limb - borrow
    89  	mov	%r8, %r12		C keep the unshifted difference
    90  	sbb	R32(%rbx), R32(%rbx)	C save carry flag
    91  	shl	R8(%rcx), %r8		C difference << cnt
    92  	or	%r15, %r8		C merge bits carried out of the previous limb
    93  	mov	%r12, %r15
    94  	lea	8(vp), vp
    95  	neg	R8(%rcx)		C cl = -cnt, which shifts like 64-cnt
    96  	shr	R8(%rcx), %r15		C r15 = high bits to carry into the next limb
    97  	neg	R8(%rcx)		C cl = cnt again
    98  	mov	%r8, 0(rp)
    99  	lea	8(rp), rp
   100  	sub	$1, R32(%r11)
   101  	jnc	L(oopette)		C loop until the counter borrows
   102  
        	C rax still holds the full n here.  Its low two bits do not change
        	C how many times 4 can be subtracted before a borrow, so the 4-way
        	C loop below runs floor(n/4) times.
   103  L(4):
   104  	sub	$4, %rax
   105  	jc	L(end)			C fewer than 4 limbs in total, nothing left
   106  
   107  	ALIGN(16)
   108  L(oop):
   109  	add	R32(%rbx), R32(%rbx)	C restore carry flag
   110  
   111  	mov	0(up), %r8		C load four U limbs
   112  	mov	8(up), %r9
   113  	mov	16(up), %r10
   114  	mov	24(up), %r11
   115  
   116  	lea	32(up), up
   117  
        	C Borrow-chained subtract of four V limbs.  The unshifted
        	C differences are copied to r12-r14 for the right shifts below.
   118  	sbb	0(vp), %r8
   119  	mov	%r8, %r12
   120  	sbb	8(vp), %r9
   121  	mov	%r9, %r13
   122  	sbb	16(vp), %r10
   123  	mov	%r10, %r14
   124  	sbb	24(vp), %r11
   125  
   126  	sbb	R32(%rbx), R32(%rbx)	C save carry flag
   127  
   128  	shl	R8(%rcx), %r8
   129  	shl	R8(%rcx), %r9
   130  	shl	R8(%rcx), %r10
   131  	or	%r15, %r8		C merge carry-in from the previous group
   132  	mov	%r11, %r15		C copy top difference before it is shifted
   133  	shl	R8(%rcx), %r11
   134  
   135  	lea	32(vp), vp
   136  
   137  	neg	R8(%rcx)		C cl = -cnt, which shifts like 64-cnt
   138  
   139  	shr	R8(%rcx), %r12
   140  	shr	R8(%rcx), %r13
   141  	shr	R8(%rcx), %r14
   142  	shr	R8(%rcx), %r15		C used next loop
   143  
   144  	or	%r12, %r9		C each limb receives the high bits of the
   145  	or	%r13, %r10		C limb below it
   146  	or	%r14, %r11
   147  
   148  	neg	R8(%rcx)		C cl = cnt again
   149  
   150  	mov	%r8, 0(rp)
   151  	mov	%r9, 8(rp)
   152  	mov	%r10, 16(rp)
   153  	mov	%r11, 24(rp)
   154  
   155  	lea	32(rp), rp
   156  
   157  	sub	$4, %rax
   158  	jnc	L(oop)
        	C Build the return value from the saved borrow and the bits shifted
        	C out of the last limb.
        	C NOTE(review): adc also adds the carry flag left by the shl.  For cnt
        	C in 1..63 the bit shifted out of a 0-or-1 value is zero, so this looks
        	C equivalent to a plain add -- confirm.
   159  L(end):
   160  	neg	R32(%rbx)		C all ones or 0 becomes 1 or 0
   161  	shl	R8(%rcx), %rbx		C borrow << cnt
   162  	adc	%r15, %rbx		C plus the bits shifted out of the top limb
   163  	mov	%rbx, %rax		C return value
   164  	pop	%rbx			C restore in reverse push order
   165  	pop	%r15
   166  	pop	%r14
   167  	pop	%r13
   168  	pop	%r12
   169  
   170  	FUNC_EXIT()
   171  	ret
   172  EPILOGUE()