github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/addlsh1_n.asm (about)

     1  dnl  Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
     2  
     3  dnl  Copyright 2001-2004, 2006 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C					cycles/limb
    35  C			     dst!=src1,2  dst==src1  dst==src2
    36  C P6 model 0-8,10-12		-
    37  C P6 model 9   (Banias)		?
    38  C P6 model 13  (Dothan)		?
    39  C P4 model 0-1 (Willamette)	?
    40  C P4 model 2   (Northwood)	4.25	     6		6
    41  C P4 model 3-4 (Prescott)	5	     8.5	8.5
    42  
    43  C The slightly strange combination of indexing and pointer incrementing
    44  C that's used seems to work best.  Not sure why, but %ecx,4 with src1 and/or
    45  C src2 is a slowdown.
    46  C
    47  C The dependent chain is simply the paddq of x+2*y to the previous carry,
    48  C then psrlq to get the new carry.  That makes 4 c/l the target speed, which
    49  C is almost achieved for separate src/dst but when src==dst the write
    50  C combining anomalies slow it down.
    51  
    52  defframe(PARAM_SIZE, 16)
    53  defframe(PARAM_SRC2, 12)
    54  defframe(PARAM_SRC1, 8)
    55  defframe(PARAM_DST,  4)
    56  
    57  dnl  re-use parameter space
    58  define(SAVE_EBX,`PARAM_SRC1')
    59  
    60  	TEXT
    61  	ALIGN(8)
    62  
    63  PROLOGUE(mpn_addlsh1_n)
    64  deflit(`FRAME',0)
    65  
    66  	mov	PARAM_SRC1, %eax
    67  	mov	%ebx, SAVE_EBX
    68  
    69  	mov	PARAM_SRC2, %ebx
    70  	pxor	%mm0, %mm0		C initial carry
    71  
    72  	mov	PARAM_DST, %edx
    73  
    74  	mov	PARAM_SIZE, %ecx
    75  
    76  	lea	(%edx,%ecx,4), %edx	C dst end
    77  	neg	%ecx			C -size
    78  
    79  L(top):
    80  	C eax	src1 end
    81  	C ebx	src2 end
    82  	C ecx	counter, limbs, negative
    83  	C edx	dst end
    84  	C mm0	carry
    85  
    86  	movd	(%ebx), %mm2
    87  	movd	(%eax), %mm1
    88  	psrlq	$32, %mm0
    89  	lea	4(%eax), %eax
    90  	lea	4(%ebx), %ebx
    91  
    92  	psllq	$1, %mm2
    93  	paddq	%mm2, %mm1
    94  
    95  	paddq	%mm1, %mm0
    96  
    97  	movd	%mm0, (%edx,%ecx,4)
    98  	add	$1, %ecx
    99  	jnz	L(top)
   100  
   101  
   102  	psrlq	$32, %mm0
   103  	mov	SAVE_EBX, %ebx
   104  	movd	%mm0, %eax
   105  	emms
   106  	ret
   107  
   108  EPILOGUE()