github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/aorsmul_1.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/aorsmul_1.asm (about)

     1  dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
     2  
     3  dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C			    cycles/limb
    35  C P5
    36  C P6 model 0-8,10-12		 6.44
    37  C P6 model 9  (Banias)		 6.15
    38  C P6 model 13 (Dothan)		 6.11
    39  C P4 model 0  (Willamette)
    40  C P4 model 1  (?)
    41  C P4 model 2  (Northwood)
    42  C P4 model 3  (Prescott)
    43  C P4 model 4  (Nocona)
    44  C AMD K6
    45  C AMD K7
    46  C AMD K8
    47  
    48  
    49  dnl  P6 UNROLL_COUNT cycles/limb
    50  dnl          8           6.7
    51  dnl         16           6.35
    52  dnl         32           6.3
    53  dnl         64           6.3
    54  dnl  Maximum possible with the current code is 64.
    55  dnl  Limbs per pass of the unrolled loop; the forloop at L(entry) emits UNROLL_COUNT/CHUNK_COUNT chunks.
    56  deflit(UNROLL_COUNT, 16)
    57  dnl  Pick the add or subtract flavour: M4_inst is the instruction applied to dst,
    58  dnl  M4_function_1/_1c name the entry points; m4_error is invoked if no OPERATION_* is defined.
    59  ifdef(`OPERATION_addmul_1', `
    60  	define(M4_inst,        addl)
    61  	define(M4_function_1,  mpn_addmul_1)
    62  	define(M4_function_1c, mpn_addmul_1c)
    63  	define(M4_description, add it to)
    64  	define(M4_desc_retval, carry)
    65  ',`ifdef(`OPERATION_submul_1', `
    66  	define(M4_inst,        subl)
    67  	define(M4_function_1,  mpn_submul_1)
    68  	define(M4_function_1c, mpn_submul_1c)
    69  	define(M4_description, subtract it from)
    70  	define(M4_desc_retval, borrow)
    71  ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
    72  ')')')
    73  dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it lists the functions this source can provide.
    74  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
    75  
    76  
    77  C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    78  C                            mp_limb_t mult);
    79  C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
    80  C                             mp_limb_t mult, mp_limb_t carry);
    81  C
    82  C Calculate src,size multiplied by mult and M4_description dst,size.
    83  C Return the M4_desc_retval limb from the top of the result.
    84  C
    85  C This code is pretty much the same as the K6 code.  The unrolled loop is
    86  C the same, but there are just a few scheduling tweaks in the setups and
    87  C the simple loop.
    88  C
    89  C A number of variations have been tried for the unrolled loop, with one or
    90  C two carries, and with loads scheduled earlier, but nothing faster than 6
    91  C cycles/limb has been found.
    92  dnl  Sizes of UNROLL_THRESHOLD limbs or more take the unrolled path (cmpl/jae before L(simple)); both arms are currently 5.
    93  ifdef(`PIC',`
    94  deflit(UNROLL_THRESHOLD, 5)
    95  ',`
    96  deflit(UNROLL_THRESHOLD, 5)
    97  ')
    98  dnl  Stack offsets of the arguments, in the order of the prototype above (dst, src, size, mult, carry).
    99  defframe(PARAM_CARRY,     20)
   100  defframe(PARAM_MULTIPLIER,16)
   101  defframe(PARAM_SIZE,      12)
   102  defframe(PARAM_SRC,       8)
   103  defframe(PARAM_DST,       4)
   104  	C Entry point taking a caller-supplied initial carry limb.
   105  	TEXT
   106  	ALIGN(32)
   107  
   108  PROLOGUE(M4_function_1c)
   109  	pushl	%ebx		C preserve ebx; it holds the running carry limb
   110  deflit(`FRAME',4)
   111  	movl	PARAM_CARRY, %ebx	C ebx = initial carry argument
   112  	jmp	L(start_nc)	C continue in the code shared with M4_function_1
   113  EPILOGUE()
   114  	C Entry point without a carry argument: the initial carry is zero.
   115  PROLOGUE(M4_function_1)
   116  	pushl	%ebx		C preserve ebx; it holds the running carry limb
   117  deflit(`FRAME',4)
   118  	xorl	%ebx, %ebx	C initial carry
   119  	C Shared by both entry points; ebx holds the initial carry here.
   120  L(start_nc):
   121  	movl	PARAM_SIZE, %ecx	C ecx = size in limbs
   122  	pushl	%esi
   123  deflit(`FRAME',8)
   124  	C FRAME is redefined after every push; presumably the PARAM_* offsets depend on it.
   125  	movl	PARAM_SRC, %esi
   126  	pushl	%edi
   127  deflit(`FRAME',12)
   128  
   129  	movl	PARAM_DST, %edi
   130  	pushl	%ebp
   131  deflit(`FRAME',16)
   132  	cmpl	$UNROLL_THRESHOLD, %ecx
   133  	C movl leaves the flags from the cmpl intact for the jae below
   134  	movl	PARAM_MULTIPLIER, %ebp
   135  	jae	L(unroll)	C size >= UNROLL_THRESHOLD: use the unrolled loop
   136  
   137  	C Sizes below UNROLL_THRESHOLD: one limb per iteration, carry limb kept in ebx.
   138  	C simple loop
   139  	C this is offset 0x22, so close enough to aligned
   140  L(simple):
   141  	C eax	scratch
   142  	C ebx	carry
   143  	C ecx	counter
   144  	C edx	scratch
   145  	C esi	src
   146  	C edi	dst
   147  	C ebp	multiplier
   148  	C NOTE(review): the body runs before ecx is tested, so size 0 is presumably not a supported input.
   149  	movl	(%esi), %eax	C eax = next src limb
   150  	addl	$4, %edi	C advance dst early; the update below uses -4(%edi)
   151  
   152  	mull	%ebp		C edx:eax = src limb * multiplier
   153  
   154  	addl	%ebx, %eax	C add the carry limb to the low half
   155  	adcl	$0, %edx	C and propagate into the high half
   156  
   157  	M4_inst	%eax, -4(%edi)	C add to or subtract from the dst limb
   158  	movl	%edx, %ebx	C new carry limb; movl keeps the carry flag
   159  
   160  	adcl	$0, %ebx	C fold in the carry or borrow from M4_inst
   161  	decl	%ecx
   162  
   163  	leal	4(%esi), %esi	C advance src with leal so the decl flags survive
   164  	jnz	L(simple)
   165  
   166  	C Done: restore the saved registers and return the carry limb in eax.
   167  	popl	%ebp
   168  	popl	%edi
   169  
   170  	popl	%esi
   171  	movl	%ebx, %eax
   172  
   173  	popl	%ebx
   174  	ret
   175  
   176  
   177  
   178  C------------------------------------------------------------------------------
   179  C VAR_JUMP holds the computed jump temporarily because there are not enough
   180  C registers when doing the mul for the initial two carry limbs.
   181  C
   182  C The add/adc for the initial carry in %ebx is necessary only for the
   183  C mpn_add/submul_1c entry points.  Duplicating the startup code to
   184  C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
   185  C idea.
   186  dnl  Scratch stack slots for the unrolled path, reusing argument slots whose values are already in ecx (size) and edi (dst).
   187  dnl  overlapping with parameters already fetched
   188  define(VAR_COUNTER,`PARAM_SIZE')
   189  define(VAR_JUMP,   `PARAM_DST')
   190  	C Unrolled path setup: compute the loop counter and entry address, then multiply the first limb.
   191  	C this is offset 0x43, so close enough to aligned
   192  L(unroll):
   193  	C eax
   194  	C ebx	initial carry
   195  	C ecx	size
   196  	C edx
   197  	C esi	src
   198  	C edi	dst
   199  	C ebp	multiplier
   200  
   201  	movl	%ecx, %edx
   202  	decl	%ecx
   203  
   204  	subl	$2, %edx
   205  	negl	%ecx		C ecx = -(size-1)
   206  
   207  	shrl	$UNROLL_LOG2, %edx	C edx = (size-2) >> UNROLL_LOG2
   208  	andl	$UNROLL_MASK, %ecx	C ecx = -(size-1) & UNROLL_MASK, the limbs to skip
   209  
   210  	movl	%edx, VAR_COUNTER	C loop counter; the loop ends when decl takes it negative
   211  	movl	%ecx, %edx
   212  	C Entry address is L(entry) plus 15 bytes per skipped limb, formed as 16*n - n; ecx ends up as -n.
   213  	C 15 code bytes per limb
   214  ifdef(`PIC',`
   215  	call	L(pic_calc)
   216  L(here):
   217  ',`
   218  	shll	$4, %edx
   219  	negl	%ecx
   220  
   221  	leal	L(entry) (%edx,%ecx,1), %edx
   222  ')
   223  	movl	(%esi), %eax		C src low limb
   224  	C edx is needed by mull, so park the jump target in VAR_JUMP
   225  	movl	%edx, VAR_JUMP
   226  	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
   227  	C src now skips the limb just loaded and is biased back by the limbs the entry jump skips
   228  	mull	%ebp
   229  
   230  	addl	%ebx, %eax	C initial carry (from _1c)
   231  	adcl	$0, %edx
   232  
   233  	movl	%edx, %ebx	C high carry
   234  	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
   235  	C the 128 bias used when UNROLL_BYTES is 256 pairs with the -128 in the loop displacements
   236  	movl	VAR_JUMP, %edx
   237  	testl	$1, %ecx
   238  	movl	%eax, %ecx	C low carry
   239  	C an odd skip count enters at the second limb of a chunk, where ebx and ecx swap roles
   240  	cmovnz(	%ebx, %ecx)	C high,low carry other way around
   241  	cmovnz(	%eax, %ebx)
   242  	C enter the unrolled code part way through its first pass
   243  	jmp	*%edx
   244  
   245  	C PIC only: called from L(unroll); leaves the jump target in edx and ecx negated.
   246  ifdef(`PIC',`
   247  L(pic_calc):
   248  	shll	$4, %edx
   249  	negl	%ecx
   250  	C edx = 16*n - n, the same 15 bytes per limb offset as the non-PIC code
   251  	C See mpn/x86/README about old gas bugs
   252  	leal	(%edx,%ecx,1), %edx
   253  	addl	$L(entry)-L(here), %edx
   254  	C the return address on the stack is L(here), so adding it yields an absolute target
   255  	addl	(%esp), %edx
   256  	C NOTE(review): ret_internal is defined outside this file; presumably a plain return
   257  	ret_internal
   258  ')
   259  
   260  	C Unrolled loop: UNROLL_COUNT limbs per pass in two-limb chunks, with ebx and ecx alternating as pending limb.
   261  C -----------------------------------------------------------
   262  	ALIGN(32)
   263  L(top):
   264  deflit(`FRAME',16)
   265  	C eax	scratch
   266  	C ebx	carry hi
   267  	C ecx	carry lo
   268  	C edx	scratch
   269  	C esi	src
   270  	C edi	dst
   271  	C ebp	multiplier
   272  	C
   273  	C VAR_COUNTER	loop counter
   274  	C
   275  	C 15 code bytes per limb
   276  	C dst advances here, so the first pass entered through L(entry) skips this add
   277  	addl	$UNROLL_BYTES, %edi
   278  	C L(unroll) jumps to L(entry) plus an offset, part way into the first pass
   279  L(entry):
   280  deflit(CHUNK_COUNT,2)
   281  forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
   282  	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
   283  	deflit(`disp1', eval(disp0 + 4))
   284  	C first limb: ecx is applied to dst, ebx collects the next pending limb (Zdisp presumably keeps the code size fixed)
   285  Zdisp(	movl,	disp0,(%esi), %eax)
   286  	mull	%ebp
   287  Zdisp(	M4_inst,%ecx, disp0,(%edi))
   288  	adcl	%eax, %ebx
   289  	movl	%edx, %ecx
   290  	adcl	$0, %ecx
   291  	C second limb: the same steps with ebx and ecx exchanged
   292  	movl	disp1(%esi), %eax
   293  	mull	%ebp
   294  	M4_inst	%ebx, disp1(%edi)
   295  	adcl	%eax, %ecx
   296  	movl	%edx, %ebx
   297  	adcl	$0, %ebx
   298  ')
   299  	C leal advances src without disturbing the sign flag from decl
   300  	decl	VAR_COUNTER
   301  	leal	UNROLL_BYTES(%esi), %esi
   302  
   303  	jns	L(top)
   304  
   305  	C Finish: apply the pending limb in ecx just past the last pass (edi was not advanced), then return ebx plus carry.
   306  deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
   307  
   308  	M4_inst	%ecx, disp0(%edi)
   309  	movl	%ebx, %eax
   310  	C movl and the pops leave the carry flag from M4_inst intact for the adcl below
   311  	popl	%ebp
   312  	popl	%edi
   313  
   314  	popl	%esi
   315  	popl	%ebx
   316  	adcl	$0, %eax
   317  
   318  	ret
   319  
   320  EPILOGUE()