github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/aorsmul_1.asm (about)

     1  dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
     2  
     3  dnl  Copyright 1999-2003, 2005 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C			    cycles/limb
    35  C P5
    36  C P6 model 0-8,10-12		 5.94
    37  C P6 model 9  (Banias)		 5.51
    38  C P6 model 13 (Dothan)		 5.57
    39  C P4 model 0  (Willamette)
    40  C P4 model 1  (?)
    41  C P4 model 2  (Northwood)
    42  C P4 model 3  (Prescott)
    43  C P4 model 4  (Nocona)
    44  C AMD K6			7.65-8.5 (data dependent)
    45  C AMD K7
    46  C AMD K8
    47  
    48  
    49  dnl  K6:           large multipliers  small multipliers
    50  dnl  UNROLL_COUNT    cycles/limb       cycles/limb
    51  dnl        4             9.5              7.78
    52  dnl        8             9.0              7.78
    53  dnl       16             8.4              7.65
    54  dnl       32             8.4              8.2
    55  dnl
    56  dnl  Maximum possible unrolling with the current code is 32.
    57  dnl
    58  dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
    59  dnl  byte block, which might explain the good speed at that unrolling.
    60  
    61  deflit(UNROLL_COUNT, 16)
    62  
    63  
    64  ifdef(`OPERATION_addmul_1', `
    65  	define(M4_inst,        addl)
    66  	define(M4_function_1,  mpn_addmul_1)
    67  	define(M4_function_1c, mpn_addmul_1c)
    68  ',`ifdef(`OPERATION_submul_1', `
    69  	define(M4_inst,        subl)
    70  	define(M4_function_1,  mpn_submul_1)
    71  	define(M4_function_1c, mpn_submul_1c)
    72  ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
    73  ')')')
    74  
    75  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
    76  
    77  
    78  C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    79  C                         mp_limb_t mult);
    80  C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
    81  C                          mp_limb_t mult, mp_limb_t carry);
    82  C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    83  C                         mp_limb_t mult);
    84  C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
    85  C                          mp_limb_t mult, mp_limb_t carry);
    86  C
    87  C The jadcl0()s in the unrolled loop makes the speed data dependent.  Small
    88  C multipliers (most significant few bits clear) result in few carry bits and
    89  C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
    90  C significant few bits set) make the carry bits 50/50 and lead to something
    91  C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
    92  C
    93  C It's important that the gains for jadcl0 on small multipliers don't come
    94  C at the cost of slowing down other data.  Tests on uniformly distributed
    95  C random data, designed to confound branch prediction, show about a 7%
    96  C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
    97  C overheads included).
    98  C
    99  C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
   100  C 11.0 cycles/limb), and hence isn't used.
   101  C
   102  C In the simple loop, note that running ecx from negative to zero and using
   103  C it as an index in the two movs wouldn't help.  It would save one
   104  C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
   105  C that would be collapsed by this.
   106  C
   107  C Attempts at a simpler main loop, with less unrolling, haven't yielded much
   108  C success, generally running over 9 c/l.
   109  C
   110  C
   111  C jadcl0
   112  C ------
   113  C
   114  C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
   115  C firstly the instruction decoding and secondly the fact that there's a
   116  C carry bit for the jadcl0 only on average about 1/4 of the time.
   117  C
   118  C The code in the unrolled loop decodes something like the following.
   119  C
   120  C                                         decode cycles
   121  C		mull	%ebp                    2
   122  C		M4_inst	%esi, disp(%edi)        1
   123  C		adcl	%eax, %ecx              2
   124  C		movl	%edx, %esi            \ 1
   125  C		jnc	1f                    /
   126  C		incl	%esi                  \ 1
   127  C	1:	movl	disp(%ebx), %eax      /
   128  C                                              ---
   129  C                                               7
   130  C
   131  C In a back-to-back style test this measures 7 with the jnc not taken, or 8
   132  C with it taken (both when correctly predicted).  This is opposite to the
   133  C measurements showing small multipliers running faster than large ones.
   134  C Don't really know why.
   135  C
   136  C It's not clear how much branch misprediction might be costing.  The K6
   137  C doco says it will be 1 to 4 cycles, but presumably it's near the low end
   138  C of that range to get the measured results.
   139  C
   140  C
   141  C In the code the two carries are more or less the preceding mul product and
   142  C the calculation is roughly
   143  C
   144  C	x*y + u*b+v
   145  C
   146  C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
   147  C v are the two limbs it's added to (being the low of the next mul, and a
   148  C limb from the destination).
   149  C
   150  C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
   151  C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
   152  C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
   153  C and b-1, then the total probability can be summed over x and y,
   154  C
   155  C	 1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
   156  C	--- * sum sum --- = --- * ------- * ------- = 1/4
   157  C       b^2   x=0 y=1 b^2   b^4      2         2
   158  C
   159  C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
   160  C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
   161  
   162  
   163  ifdef(`PIC',`
   164  deflit(UNROLL_THRESHOLD, 9)
   165  ',`
   166  deflit(UNROLL_THRESHOLD, 6)
   167  ')
   168  
   169  defframe(PARAM_CARRY,     20)
   170  defframe(PARAM_MULTIPLIER,16)
   171  defframe(PARAM_SIZE,      12)
   172  defframe(PARAM_SRC,       8)
   173  defframe(PARAM_DST,       4)
   174  
   175  	TEXT
   176  	ALIGN(32)
   177  
   178  PROLOGUE(M4_function_1c)
   179  	pushl	%esi
   180  deflit(`FRAME',4)
   181  	movl	PARAM_CARRY, %esi
   182  	jmp	L(start_nc)
   183  EPILOGUE()
   184  
   185  PROLOGUE(M4_function_1)
   186  	push	%esi
   187  deflit(`FRAME',4)
   188  	xorl	%esi, %esi	C initial carry
   189  
   190  L(start_nc):
   191  	movl	PARAM_SIZE, %ecx
   192  	pushl	%ebx
   193  deflit(`FRAME',8)
   194  
   195  	movl	PARAM_SRC, %ebx
   196  	pushl	%edi
   197  deflit(`FRAME',12)
   198  
   199  	cmpl	$UNROLL_THRESHOLD, %ecx
   200  	movl	PARAM_DST, %edi
   201  
   202  	pushl	%ebp
   203  deflit(`FRAME',16)
   204  	jae	L(unroll)
   205  
   206  
   207  	C simple loop
   208  
   209  	movl	PARAM_MULTIPLIER, %ebp
   210  
   211  L(simple):
   212  	C eax	scratch
   213  	C ebx	src
   214  	C ecx	counter
   215  	C edx	scratch
   216  	C esi	carry
   217  	C edi	dst
   218  	C ebp	multiplier
   219  
   220  	movl	(%ebx), %eax
   221  	addl	$4, %ebx
   222  
   223  	mull	%ebp
   224  
   225  	addl	$4, %edi
   226  	addl	%esi, %eax
   227  
   228  	adcl	$0, %edx
   229  
   230  	M4_inst	%eax, -4(%edi)
   231  
   232  	adcl	$0, %edx
   233  
   234  	movl	%edx, %esi
   235  	loop	L(simple)
   236  
   237  
   238  	popl	%ebp
   239  	popl	%edi
   240  
   241  	popl	%ebx
   242  	movl	%esi, %eax
   243  
   244  	popl	%esi
   245  	ret
   246  
   247  
   248  
   249  C -----------------------------------------------------------------------------
   250  C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
   251  C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
   252  C For the computed jump an odd size means they start one way around, an even
   253  C size the other.
   254  C
   255  C VAR_JUMP holds the computed jump temporarily because there's not enough
   256  C registers at the point of doing the mul for the initial two carry limbs.
   257  C
   258  C The add/adc for the initial carry in %esi is necessary only for the
   259  C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
   260  C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
   261  C idea.
   262  
   263  dnl  overlapping with parameters already fetched
   264  define(VAR_COUNTER, `PARAM_SIZE')
   265  define(VAR_JUMP,    `PARAM_DST')
   266  
   267  L(unroll):
   268  	C eax
   269  	C ebx	src
   270  	C ecx	size
   271  	C edx
   272  	C esi	initial carry
   273  	C edi	dst
   274  	C ebp
   275  
   276  	movl	%ecx, %edx
   277  	decl	%ecx
   278  
   279  	subl	$2, %edx
   280  	negl	%ecx
   281  
   282  	shrl	$UNROLL_LOG2, %edx
   283  	andl	$UNROLL_MASK, %ecx
   284  
   285  	movl	%edx, VAR_COUNTER
   286  	movl	%ecx, %edx
   287  
   288  	shll	$4, %edx
   289  	negl	%ecx
   290  
   291  	C 15 code bytes per limb
   292  ifdef(`PIC',`
   293  	call	L(pic_calc)
   294  L(here):
   295  ',`
   296  	leal	L(entry) (%edx,%ecx,1), %edx
   297  ')
   298  	movl	(%ebx), %eax		C src low limb
   299  
   300  	movl	PARAM_MULTIPLIER, %ebp
   301  	movl	%edx, VAR_JUMP
   302  
   303  	mull	%ebp
   304  
   305  	addl	%esi, %eax	C initial carry (from _1c)
   306  	jadcl0(	%edx)
   307  
   308  
   309  	leal	4(%ebx,%ecx,4), %ebx
   310  	movl	%edx, %esi	C high carry
   311  
   312  	movl	VAR_JUMP, %edx
   313  	leal	(%edi,%ecx,4), %edi
   314  
   315  	testl	$1, %ecx
   316  	movl	%eax, %ecx	C low carry
   317  
   318  	jz	L(noswap)
   319  	movl	%esi, %ecx	C high,low carry other way around
   320  
   321  	movl	%eax, %esi
   322  L(noswap):
   323  
   324  	jmp	*%edx
   325  
   326  
   327  ifdef(`PIC',`
   328  L(pic_calc):
   329  	C See mpn/x86/README about old gas bugs
   330  	leal	(%edx,%ecx,1), %edx
   331  	addl	$L(entry)-L(here), %edx
   332  	addl	(%esp), %edx
   333  	ret_internal
   334  ')
   335  
   336  
   337  C -----------------------------------------------------------
   338  	ALIGN(32)
   339  L(top):
   340  deflit(`FRAME',16)
   341  	C eax	scratch
   342  	C ebx	src
   343  	C ecx	carry lo
   344  	C edx	scratch
   345  	C esi	carry hi
   346  	C edi	dst
   347  	C ebp	multiplier
   348  	C
   349  	C 15 code bytes per limb
   350  
   351  	leal	UNROLL_BYTES(%edi), %edi
   352  
   353  L(entry):
   354  forloop(`i', 0, UNROLL_COUNT/2-1, `
   355  	deflit(`disp0', eval(2*i*4))
   356  	deflit(`disp1', eval(disp0 + 4))
   357  
   358  Zdisp(	movl,	disp0,(%ebx), %eax)
   359  	mull	%ebp
   360  Zdisp(	M4_inst,%ecx, disp0,(%edi))
   361  	adcl	%eax, %esi
   362  	movl	%edx, %ecx
   363  	jadcl0(	%ecx)
   364  
   365  	movl	disp1(%ebx), %eax
   366  	mull	%ebp
   367  	M4_inst	%esi, disp1(%edi)
   368  	adcl	%eax, %ecx
   369  	movl	%edx, %esi
   370  	jadcl0(	%esi)
   371  ')
   372  
   373  	decl	VAR_COUNTER
   374  
   375  	leal	UNROLL_BYTES(%ebx), %ebx
   376  	jns	L(top)
   377  
   378  
   379  	popl	%ebp
   380  	M4_inst	%ecx, UNROLL_BYTES(%edi)
   381  
   382  	popl	%edi
   383  	movl	%esi, %eax
   384  
   385  	popl	%ebx
   386  	jadcl0(	%eax)
   387  
   388  	popl	%esi
   389  	ret
   390  
   391  EPILOGUE()