github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/addlsh1_n.asm (about)

     1  dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
     2  
     3  dnl  Copyright 2011 Free Software Foundation, Inc.
     4  
     5  dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
    36  C The innerloop is 2*3-way unrolled, which is the best we can do with the
    37  C available registers.  It seems tricky to use the same structure for
    38  C rsblsh1_n, since we cannot feed carry between operations there.
    39  
    40  C			    cycles/limb
    41  C P5
    42  C P6 model 0-8,10-12
    43  C P6 model 9  (Banias)
    44  C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
    45  C P4 model 0  (Willamette)
    46  C P4 model 1  (?)
    47  C P4 model 2  (Northwood)
    48  C P4 model 3  (Prescott)
    49  C P4 model 4  (Nocona)
    50  C Intel Atom			 6
    51  C AMD K6			 ?
    52  C AMD K7			 2.5
    53  C AMD K8
    54  
    55  C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
    56  C processors.  It uses 2*3-way unrolling, for good reasons.  Unfortunately,
    57  C that means we need an initial magic multiply.
    58  C
    59  C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
    60  C cannot do rsblsh1_n since we feed carry from the shift blocks to the
    61  C add/subtract blocks, which is right for addition but reversed for
    62  C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
    63  C without losing any time, since we are limited not by issue rate but by
    64  C carry recurrency latency.
    65  C
    66  C Breaking carry recurrency might be a good idea.  We would then need separate
    67  C registers for the shift carry and add/subtract carry, which in turn would
    68  C force us to 2*2-way unrolling.
    69  
    70  defframe(PARAM_SIZE,	16)	C limb count; loaded into %ebx
    71  defframe(PARAM_DBLD,	12)	C source that gets doubled; loaded into vp
    72  defframe(PARAM_SRC,	 8)	C source that gets added; loaded into up
    73  defframe(PARAM_DST,	 4)	C destination; loaded into rp
    74  
    75  dnl  re-use parameter space
        dnl  VAR_COUNT overwrites the PARAM_DST slot once rp has been loaded from it
        dnl  and holds the negative 6-limb group counter.  VAR_TMP overwrites the
        dnl  PARAM_DBLD slot; it is written once and never read in this file.
    76  define(VAR_COUNT,`PARAM_DST')
    77  define(VAR_TMP,`PARAM_DBLD')
    78  
    79  ASM_START()
    80  	TEXT
    81  	ALIGN(8)
        C mpn_addlsh1_n -- store up[] + (vp[] << 1) into rp[], working on 32-bit
        C limbs with integer instructions only.
        C
        C Inputs (stack slots, loaded below):
        C   PARAM_DST  -> rp (%edi)   destination limbs
        C   PARAM_SRC  -> up (%esi)   limbs that are added
        C   PARAM_DBLD -> vp (%ebp)   limbs that are doubled
        C   PARAM_SIZE -> %ebx        number of limbs
        C Result: %eax = sum of the two carry bits still pending after the top
        C   limb (0, 1 or 2).
        C Registers: %ebx, %edi, %esi and %ebp are pushed on entry and popped
        C   before ret; %eax, %ecx and %edx are overwritten.
        C
        C Carry handling: two carry bits are live at any time, one produced by a
        C doubling chain and one by an addition chain.  One sits in CF while the
        C other is parked in %edx; rcr and adc %edx,%edx exchange them.  Both bits
        C have weight 1 at the next limb, so either chain may absorb either bit.
        C When CPU_P6 is defined both bits are packed into %edx at the bottom of
        C each loop and unpacked again at the top (shr, then rcr).
        C
        C NOTE(review): FRAME_pushl()/FRAME_popl() and deflit(FRAME) are defined
        C outside this file; presumably they track push depth so the PARAM_*/VAR_*
        C operands stay valid -- confirm in the shared x86 m4 definitions.
    82  PROLOGUE(mpn_addlsh1_n)
    83  deflit(`FRAME',0)
    84  
    85  define(`rp',  `%edi')
    86  define(`up',  `%esi')
    87  define(`vp',  `%ebp')
    88  
    89  	mov	$0x2aaaaaab, %eax	C (2^32+2)/6, reciprocal used by mul below
    90  
    91  	push	%ebx			FRAME_pushl()
    92  	mov	PARAM_SIZE, %ebx	C size
    93  
    94  	push	rp			FRAME_pushl()
    95  	mov	PARAM_DST, rp
    96  
    97  	mul	%ebx			C edx = high word of product = size\6
    98  
    99  	push	up			FRAME_pushl()
   100  	mov	PARAM_SRC, up
   101  
   102  	not	%edx			C count = -(size\6)-1
   103  	mov	%edx, VAR_COUNT
   104  
   105  	push	vp			FRAME_pushl()
   106  	mov	PARAM_DBLD, vp
   107  
   108  	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
   109  	xor	%edx, %edx		C no carries pending: edx = 0, CF = 0
   110  	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
   111  	or	%ebx, %ebx		C ZF set when size % 6 is 0; CF stays 0
   112  	jz	L(exact)
   113  
        C Leftover loop: handles the size % 6 limbs, one per iteration.  Pointer
        C and counter updates use lea and dec, which leave CF untouched.
   114  L(oop):
   115  ifdef(`CPU_P6',`
   116  	shr	%edx ')			C restore 2nd saved carry bit
   117  	mov	(vp), %eax
   118  	adc	%eax, %eax		C eax = 2*vp[0] + carry in; CF = bit shifted out
   119  	rcr	%edx			C restore 1st saved carry bit
   120  	lea	4(vp), vp
   121  	adc	(up), %eax		C eax += up[0] + carry in
   122  	lea	4(up), up
   123  	adc	%edx, %edx		C save a carry bit in edx
   124  ifdef(`CPU_P6',`
   125  	adc	%edx, %edx ')		C save another carry bit in edx
   126  	dec	%ebx			C leftover limbs remaining; dec keeps CF
   127  	mov	%eax, (rp)
   128  	lea	4(rp), rp
   129  	jnz	L(oop)
   130  	mov	vp, VAR_TMP		C NOTE(review): VAR_TMP is never read in this file
   131  L(exact):
   132  	incl	VAR_COUNT		C count++; zero means no 6-limb group is left
   133  	jz	L(end)
   134  
   135  	ALIGN(16)
        C Main loop: six limbs per pass, as two halves of three limbs.  Each half
        C doubles three vp limbs with one adc chain, then adds three up limbs with
        C a second adc chain and stores the results to rp.
   136  L(top):
   137  ifdef(`CPU_P6',`
   138  	shr	%edx ')			C restore 2nd saved carry bit
   139  	mov	(vp), %eax
   140  	adc	%eax, %eax
   141  	mov	4(vp), %ebx
   142  	adc	%ebx, %ebx
   143  	mov	8(vp), %ecx
   144  	adc	%ecx, %ecx
   145  
   146  	rcr	%edx			C restore 1st saved carry bit
   147  
   148  	adc	(up), %eax
   149  	mov	%eax, (rp)
   150  	adc	4(up), %ebx
   151  	mov	%ebx, 4(rp)
   152  	adc	8(up), %ecx
   153  	mov	%ecx, 8(rp)
   154  
   155  	mov	12(vp), %eax
   156  	adc	%eax, %eax		C carry in here is the carry out of the add above
   157  	mov	16(vp), %ebx
   158  	adc	%ebx, %ebx
   159  	mov	20(vp), %ecx
   160  	adc	%ecx, %ecx
   161  
   162  	lea	24(vp), vp
   163  	adc	%edx, %edx		C save a carry bit in edx
   164  
   165  	adc	12(up), %eax
   166  	mov	%eax, 12(rp)
   167  	adc	16(up), %ebx
   168  	mov	%ebx, 16(rp)
   169  	adc	20(up), %ecx
   170  
   171  	lea	24(up), up
   172  
   173  ifdef(`CPU_P6',`
   174  	adc	%edx, %edx ')		C save another carry bit in edx
   175  	mov	%ecx, 20(rp)
   176  	incl	VAR_COUNT		C count++; inc does not modify CF
   177  	lea	24(rp), rp
   178  	jne	L(top)
   179  
   180  L(end):
   181  	pop	vp			FRAME_popl()
   182  	pop	up			FRAME_popl()
   183  
        C Return value.  CPU_P6 build: unpack the two carry bits saved in edx and
        C add them.  Otherwise: one carry is still in CF (the pops above leave the
        C flags alone) and the other is the value of edx; add them.
   184  ifdef(`CPU_P6',`
   185  	xor	%eax, %eax
   186  	shr	$1, %edx
   187  	adc	%edx, %eax
   188  ',`
   189  	adc	$0, %edx
   190  	mov	%edx, %eax
   191  ')
   192  	pop	rp			FRAME_popl()
   193  	pop	%ebx			FRAME_popl()
   194  	ret
   195  EPILOGUE()
   196  ASM_END()