github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mul_basecase.asm (about)

     1  dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
     2  
     3  dnl  Copyright 1999-2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
    35  C     unrolling).
    36  
    37  
    38  
    39  dnl  K6: UNROLL_COUNT cycles/product (approx)
    40  dnl           8           9.75
    41  dnl          16           9.3
    42  dnl          32           9.3
    43  dnl  Maximum possible with the current code is 32.
    44  dnl
    45  dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
    46  dnl  might explain its good performance.
    47  
    48  deflit(UNROLL_COUNT, 16)
    49  
    50  
    51  C void mpn_mul_basecase (mp_ptr wp,
    52  C                        mp_srcptr xp, mp_size_t xsize,
    53  C                        mp_srcptr yp, mp_size_t ysize);
    54  C
    55  C Calculate xp,xsize multiplied by yp,ysize, storing the result in
    56  C wp,xsize+ysize.
    57  C
    58  C This routine is essentially the same as mpn/generic/mul_basecase.c, but
    59  C it's faster because it does most of the mpn_addmul_1() entry code only
    60  C once.  The saving is about 10-20% on typical sizes coming from the
    61  C Karatsuba multiply code.
    62  C
    63  C Enhancements:
    64  C
    65  C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
    66  C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
    67  C
    68  C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
    69  C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
    70  C would be 256 bytes.
    71  
    72  ifdef(`PIC',`
    73  deflit(UNROLL_THRESHOLD, 8)
    74  ',`
    75  deflit(UNROLL_THRESHOLD, 8)
    76  ')	C the PIC and non-PIC values are currently the same
    77  C With more than one yp limb, an xsize at or above UNROLL_THRESHOLD selects the unrolled addmul code.
    78  defframe(PARAM_YSIZE,20)	C ysize, fifth argument of the prototype above
    79  defframe(PARAM_YP,   16)	C yp, fourth argument
    80  defframe(PARAM_XSIZE,12)	C xsize, third argument
    81  defframe(PARAM_XP,   8)	C xp, second argument
    82  defframe(PARAM_WP,   4)	C wp, first argument (destination)
    83  
    84  	TEXT
    85  	ALIGN(32)
    86  PROLOGUE(mpn_mul_basecase)
    87  deflit(`FRAME',0)
    88  	C Entry: fetch xsize, xp and yp[0], then dispatch on xsize.
    89  	movl	PARAM_XSIZE, %ecx	C ecx = xsize
    90  	movl	PARAM_YP, %eax		C eax = yp
    91  
    92  	movl	PARAM_XP, %edx		C edx = xp
    93  	movl	(%eax), %eax	C yp low limb
    94  
    95  	cmpl	$2, %ecx
    96  	ja	L(xsize_more_than_two_limbs)	C xsize above 2: general code
    97  	je	L(two_by_something)		C xsize equal to 2: 2x1 or 2x2 special cases
    98  
    99  
   100  	C one limb by one limb
   101  	C Reached when xsize is below 2.  ysize is not examined on this path - presumably callers guarantee ysize <= xsize (TODO confirm).
   102  	movl	(%edx), %edx	C xp low limb
   103  	movl	PARAM_WP, %ecx		C ecx = wp
   104  
   105  	mull	%edx			C edx:eax = yp[0] * xp[0]
   106  
   107  	movl	%eax, (%ecx)		C wp[0] = low half
   108  	movl	%edx, 4(%ecx)		C wp[1] = high half
   109  	ret
   110  
   111  
   112  C -----------------------------------------------------------------------------
   113  L(two_by_something):
   114  	decl	PARAM_YSIZE		C ysize-1 in the stack slot, ZF set when ysize was 1
   115  	pushl	%ebx			C save ebx
   116  deflit(`FRAME',4)
   117  
   118  	movl	PARAM_WP, %ebx		C ebx = wp
   119  	pushl	%esi			C save esi
   120  deflit(`FRAME',8)
   121  
   122  	movl	%eax, %ecx	C yp low limb
   123  	movl	(%edx), %eax	C xp low limb
   124  
   125  	movl	%edx, %esi	C xp
   126  	jnz	L(two_by_two)		C flags are still those of the decl above, push and mov do not change them
   127  
   128  
   129  	C two limbs by one limb
   130  
   131  	mull	%ecx			C edx:eax = xp[0] * yp[0]
   132  
   133  	movl	%eax, (%ebx)		C wp[0]
   134  	movl	4(%esi), %eax		C xp[1]
   135  
   136  	movl	%edx, %esi	C carry
   137  
   138  	mull	%ecx			C edx:eax = xp[1] * yp[0]
   139  
   140  	addl	%eax, %esi		C low half plus carry from the first product
   141  	movl	%esi, 4(%ebx)		C wp[1]
   142  
   143  	adcl	$0, %edx		C carry from the addl into the high half
   144  
   145  	movl	%edx, 8(%ebx)		C wp[2]
   146  	popl	%esi			C restore saved registers
   147  
   148  	popl	%ebx
   149  	ret
   150  
   151  
   152  
   153  C -----------------------------------------------------------------------------
   154  	ALIGN(16)
   155  L(two_by_two):
   156  	C eax	xp low limb
   157  	C ebx	wp
   158  	C ecx	yp low limb
   159  	C edx
   160  	C esi	xp
   161  	C edi
   162  	C ebp
   163  deflit(`FRAME',8)
   164  	C Full 2x2 product: four multiplies accumulated into wp[0] to wp[3].
   165  	mull	%ecx		C xp[0] * yp[0]
   166  
   167  	pushl	%edi		C save edi (explicit l suffix, as for every other push and pop in this file)
   168  deflit(`FRAME',12)
   169  	movl	%eax, (%ebx)	C wp[0]
   170  
   171  	movl	4(%esi), %eax	C xp[1]
   172  	movl	%edx, %edi	C carry, for wp[1]
   173  
   174  	mull	%ecx		C xp[1] * yp[0]
   175  
   176  	addl	%eax, %edi
   177  	movl	PARAM_YP, %ecx	C ecx = yp, mov leaves the carry flag for the adcl below
   178  
   179  	adcl	$0, %edx
   180  
   181  	movl	%edi, 4(%ebx)	C wp[1], still missing the low half of xp[0] * yp[1]
   182  	movl	4(%ecx), %ecx	C yp[1]
   183  
   184  	movl	4(%esi), %eax	C xp[1]
   185  	movl	%edx, %edi	C carry, for wp[2]
   186  
   187  	mull	%ecx		C xp[1] * yp[1]
   188  
   189  	addl	%eax, %edi	C wp[2] so far
   190  
   191  	adcl	$0, %edx
   192  
   193  	movl	(%esi), %eax	C xp[0]
   194  	movl	%edx, %esi	C carry, for wp[3]
   195  
   196  	mull	%ecx		C xp[0] * yp[1]
   197  
   198  	addl	%eax, 4(%ebx)	C wp[1] += low half
   199  	adcl	%edx, %edi	C wp[2] += high half plus carry
   200  	adcl	$0, %esi	C wp[3] += carry
   201  
   202  	movl	%edi, 8(%ebx)	C wp[2]
   203  	popl	%edi
   204  
   205  	movl	%esi, 12(%ebx)	C wp[3]
   206  	popl	%esi
   207  
   208  	popl	%ebx
   209  	ret
   210  
   211  
   212  C -----------------------------------------------------------------------------
   213  	ALIGN(16)
   214  L(xsize_more_than_two_limbs):
   215  
   216  C The first limb of yp is processed with a simple mpn_mul_1 style loop
   217  C inline.  Unrolling this doesn't seem worthwhile since it's only run once
   218  C (whereas the addmul below is run ysize-1 many times).  A call to the
   219  C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
   220  C popping, and doesn't seem likely to be worthwhile on the typical 10-20
   221  C limb operations the Karatsuba code calls here with.
   222  
   223  	C eax	yp[0]
   224  	C ebx
   225  	C ecx	xsize
   226  	C edx	xp
   227  	C esi
   228  	C edi
   229  	C ebp
   230  deflit(`FRAME',0)
   231  
   232  	pushl	%edi		defframe_pushl(SAVE_EDI)
   233  	pushl	%ebp		defframe_pushl(SAVE_EBP)
   234  
   235  	movl	PARAM_WP, %edi		C edi = wp
   236  	pushl	%esi		defframe_pushl(SAVE_ESI)
   237  
   238  	movl	%eax, %ebp		C ebp = multiplier yp[0]
   239  	pushl	%ebx		defframe_pushl(SAVE_EBX)
   240  
   241  	leal	(%edx,%ecx,4), %ebx	C xp end
   242  	xorl	%esi, %esi		C carry starts at zero
   243  
   244  	leal	(%edi,%ecx,4), %edi	C wp end of mul1
   245  	negl	%ecx			C counter runs from -xsize up to zero
   246  
   247  
   248  L(mul1):
   249  	C eax	scratch
   250  	C ebx	xp end
   251  	C ecx	counter, negative
   252  	C edx	scratch
   253  	C esi	carry
   254  	C edi	wp end of mul1
   255  	C ebp	multiplier
   256  
   257  	movl	(%ebx,%ecx,4), %eax	C next xp limb
   258  
   259  	mull	%ebp			C edx:eax = xp limb * yp[0]
   260  
   261  	addl	%esi, %eax		C add the incoming carry limb
   262  	movl	$0, %esi		C clear esi with mov so the carry flag survives
   263  
   264  	adcl	%edx, %esi		C new carry = high half plus carry flag
   265  
   266  	movl	%eax, (%edi,%ecx,4)	C store the result limb
   267  	incl	%ecx
   268  
   269  	jnz	L(mul1)			C until the counter reaches zero
   270  
   271  
   272  	movl	PARAM_YSIZE, %edx
   273  	movl	%esi, (%edi)		C final carry
   274  
   275  	movl	PARAM_XSIZE, %ecx
   276  	decl	%edx			C edx = ysize-1
   277  
   278  	jnz	L(ysize_more_than_one_limb)
   279  	C ysize was 1: the product is complete, restore registers and return.
   280  	popl	%ebx
   281  	popl	%esi
   282  	popl	%ebp
   283  	popl	%edi
   284  	ret
   285  
   286  
   287  L(ysize_more_than_one_limb):
   288  	cmpl	$UNROLL_THRESHOLD, %ecx	C compare xsize with the threshold
   289  	movl	PARAM_YP, %eax		C mov keeps the flags of the cmpl for the jae
   290  
   291  	jae	L(unroll)		C unsigned xsize >= UNROLL_THRESHOLD
   292  
   293  
   294  C -----------------------------------------------------------------------------
   295  C Simple addmul loop.
   296  C
   297  C Using ebx and edi pointing at the ends of their respective locations saves
   298  C a couple of instructions in the outer loop.  The inner loop is still 11
   299  C cycles, the same as the simple loop in aorsmul_1.asm.
   300  
   301  	C eax	yp
   302  	C ebx	xp end
   303  	C ecx	xsize
   304  	C edx	ysize-1
   305  	C esi
   306  	C edi	wp end of mul1
   307  	C ebp
   308  
   309  	movl	4(%eax), %ebp		C multiplier
   310  	negl	%ecx
   311  
   312  	movl	%ecx, PARAM_XSIZE	C -xsize
   313  	xorl	%esi, %esi		C initial carry
   314  
   315  	leal	4(%eax,%edx,4), %eax	C yp end
   316  	negl	%edx			C edx = -(ysize-1), outer counter
   317  
   318  	movl	%eax, PARAM_YP		C the stack slots now hold yp end and the negative counter
   319  	movl	%edx, PARAM_YSIZE
   320  
   321  	jmp	L(simple_outer_entry)
   322  
   323  
   324  	C aligning here saves a couple of cycles
   325  	ALIGN(16)
   326  L(simple_outer_top):
   327  	C edx	ysize counter, negative
   328  
   329  	movl	PARAM_YP, %eax		C yp end
   330  	xorl	%esi, %esi		C carry
   331  
   332  	movl	PARAM_XSIZE, %ecx	C -xsize
   333  	movl	%edx, PARAM_YSIZE
   334  
   335  	movl	(%eax,%edx,4), %ebp	C yp limb multiplier
   336  L(simple_outer_entry):
   337  	addl	$4, %edi		C wp end moves up one limb for each yp limb
   338  
   339  
   340  L(simple_inner):
   341  	C eax	scratch
   342  	C ebx	xp end
   343  	C ecx	counter, negative
   344  	C edx	scratch
   345  	C esi	carry
   346  	C edi	wp end of this addmul
   347  	C ebp	multiplier
   348  
   349  	movl	(%ebx,%ecx,4), %eax	C next xp limb
   350  
   351  	mull	%ebp			C edx:eax = xp limb * multiplier
   352  
   353  	addl	%esi, %eax		C add the incoming carry limb
   354  	movl	$0, %esi		C clear esi with mov so the carry flag survives
   355  
   356  	adcl	$0, %edx		C carry from the addl into the high half
   357  	addl	%eax, (%edi,%ecx,4)	C accumulate the low half into wp
   358  	adcl	%edx, %esi		C new carry = high half plus carry flag
   359  
   360  	incl	%ecx
   361  	jnz	L(simple_inner)		C until the counter reaches zero
   362  
   363  
   364  	movl	PARAM_YSIZE, %edx
   365  	movl	%esi, (%edi)		C carry limb stored at wp end of this addmul
   366  
   367  	incl	%edx
   368  	jnz	L(simple_outer_top)	C next yp limb until the counter reaches zero
   369  
   370  	C All yp limbs done: restore registers and return.
   371  	popl	%ebx
   372  	popl	%esi
   373  	popl	%ebp
   374  	popl	%edi
   375  	ret
   376  
   377  
   378  C -----------------------------------------------------------------------------
   379  C Unrolled loop.
   380  C
   381  C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
   382  C some comments.
   383  C
   384  C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
   385  C 0, inclusive.
   386  C
   387  C VAR_JMP is the computed jump into the unrolled loop.
   388  C
   389  C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
   390  C is entered.
   391  C
   392  C VAR_XP_LOW is the least significant limb of xp, which is needed at the
   393  C start of the unrolled loop.  This can't just be fetched through the xp
   394  C pointer because of the offset applied to it.
   395  C
   396  C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
   397  C inclusive.
   398  C
   399  C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
   400  C added to give the location of the next limb of yp, which is the multiplier
   401  C in the unrolled loop.
   402  C
   403  C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
   404  C to give the starting point in the destination for each unrolled loop (this
   405  C point is one limb upwards for each limb of yp processed).
   406  C
   407  C Having PARAM_YSIZE count negative to zero means it's not necessary to
   408  C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
   409  C the stack remain constant and on each loop an leal adjusts them with the
   410  C PARAM_YSIZE counter value.
   411  
   412  
   413  defframe(VAR_COUNTER,      -20)	C inner loop counter
   414  defframe(VAR_COUNTER_INIT, -24)	C value VAR_COUNTER is reset to for each yp limb
   415  defframe(VAR_JMP,          -28)	C computed entry address into the unrolled code
   416  defframe(VAR_XP_LOW,       -32)	C copy of xp[0]
   417  deflit(VAR_STACK_SPACE, 16)	C bytes reserved by the subl below for the four slots above
   418  
   419  dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
   420  dnl  slower in this code, hence the defframe empty-if-zero feature is
   421  dnl  disabled.
   422  dnl
   423  dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
   424  dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
   425  dnl  anything isn't clear.
   426  dnl
   427  define(`defframe_empty_if_zero_disabled',1)
   428  
   429  L(unroll):
   430  	C eax	yp (not used)
   431  	C ebx	xp end (not used)
   432  	C ecx	xsize
   433  	C edx	ysize-1
   434  	C esi
   435  	C edi	wp end of mul1 (not used)
   436  	C ebp
   437  deflit(`FRAME', 16)
   438  	C NOTE(review): UNROLL_LOG2 and UNROLL_MASK are not defined in this file, presumably derived from UNROLL_COUNT - confirm.
   439  	leal	-2(%ecx), %ebp	C one limb processed at start,
   440  	decl	%ecx		C and ebp is one less
   441  
   442  	shrl	$UNROLL_LOG2, %ebp	C ebp = (xsize-2) >> UNROLL_LOG2, initial inner loop count
   443  	negl	%ecx			C ecx = -(xsize-1)
   444  
   445  	subl	$VAR_STACK_SPACE, %esp	C make room for the VAR_ slots
   446  deflit(`FRAME', 16+VAR_STACK_SPACE)
   447  	andl	$UNROLL_MASK, %ecx	C n = -(xsize-1) masked with UNROLL_MASK
   448  
   449  	movl	%ecx, %esi
   450  	shll	$4, %ecx		C ecx = 16*n
   451  
   452  	movl	%ebp, VAR_COUNTER_INIT
   453  	negl	%esi			C esi = -n, so ecx+esi = 15*n bytes
   454  
   455  	C 15 code bytes per limb
   456  ifdef(`PIC',`
   457  	call	L(pic_calc)
   458  L(unroll_here):
   459  ',`
   460  	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
   461  ')
   462  	C ecx is now the address of unroll_entry plus 15*n
   463  	movl	PARAM_XP, %ebx
   464  	movl	%ebp, VAR_COUNTER
   465  
   466  	movl	PARAM_WP, %edi
   467  	movl	%ecx, VAR_JMP
   468  
   469  	movl	(%ebx), %eax		C xp[0]
   470  	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1
   471  
   472  	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling
   473  
   474  	movl	%eax, VAR_XP_LOW
   475  
   476  	movl	%ebx, PARAM_XP		C stack slot now holds the offset xp
   477  	movl	PARAM_YP, %ebx
   478  
   479  	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
   480  	movl	4(%ebx), %ebp		C multiplier (yp second limb)
   481  
   482  	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing
   483  
   484  	movl	%ecx, PARAM_WP
   485  
   486  	leal	1(%esi), %ecx	C adjust parity for decl %ecx above
   487  
   488  	movl	%ebx, PARAM_YP
   489  	negl	%edx			C edx = -(ysize-1), outer counter
   490  
   491  	movl	%edx, PARAM_YSIZE
   492  	jmp	L(unroll_outer_entry)	C eax still holds xp[0] for the first mull
   493  
   494  
   495  ifdef(`PIC',`
   496  L(pic_calc):
   497  	C See mpn/x86/README about old gas bugs
   498  	leal	(%ecx,%esi,1), %ecx	C 15*n, as in the non-PIC leal
   499  	addl	$L(unroll_entry)-L(unroll_here), %ecx	C plus the distance from unroll_here to unroll_entry
   500  	addl	(%esp), %ecx		C plus the return address, which is unroll_here
   501  	ret_internal
   502  ')
   503  
   504  
   505  C -----------------------------------------------------------------------------
   506  	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
   507  	C cost any extra space, since the inner unrolled loop below is
   508  	C aligned to 32.
   509  	ALIGN(32)
   510  L(unroll_outer_top):
   511  	C edx	ysize
   512  
   513  	movl	PARAM_YP, %eax
   514  	movl	%edx, PARAM_YSIZE	C incremented ysize counter
   515  
   516  	movl	PARAM_WP, %edi
   517  
   518  	movl	VAR_COUNTER_INIT, %ebx	C reload value for the inner loop counter
   519  	movl	(%eax,%edx,4), %ebp	C next multiplier
   520  
   521  	movl	PARAM_XSIZE, %ecx	C only the low bit is used, by the testb below
   522  	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp
   523  
   524  	movl	VAR_XP_LOW, %eax	C xp[0]
   525  	movl	%ebx, VAR_COUNTER	C reset the inner loop counter
   526  
   527  L(unroll_outer_entry):
   528  	mull	%ebp			C edx:eax = xp[0] * multiplier
   529  
   530  	C using testb is a tiny bit faster than testl
   531  	testb	$1, %cl			C low bit of ecx decides which register takes which carry half
   532  
   533  	movl	%eax, %ecx	C low carry
   534  	movl	VAR_JMP, %eax		C computed entry address
   535  
   536  	movl	%edx, %esi	C high carry
   537  	movl	PARAM_XP, %ebx		C offset xp pointer
   538  
   539  	jnz	L(unroll_noswap)	C tests the testb above, the movs leave flags alone
   540  	movl	%ecx, %esi	C high,low carry other way around
   541  
   542  	movl	%edx, %ecx
   543  L(unroll_noswap):
   544  
   545  	jmp	*%eax			C enter the unrolled code at the address from VAR_JMP
   546  
   547  
   548  
   549  C -----------------------------------------------------------------------------
   550  	ALIGN(32)
   551  L(unroll_top):
   552  	C eax	scratch
   553  	C ebx	xp
   554  	C ecx	carry low
   555  	C edx	scratch
   556  	C esi	carry high
   557  	C edi	wp
   558  	C ebp	multiplier
   559  	C VAR_COUNTER  loop counter
   560  	C
   561  	C 15 code bytes each limb
   562  	C Each forloop iteration below handles two limbs, with ecx and esi swapping the low and high carry roles between the two halves.
   563  	leal	UNROLL_BYTES(%edi), %edi	C advance wp, UNROLL_BYTES is not defined in this file (presumably 4*UNROLL_COUNT - confirm)
   564  
   565  L(unroll_entry):
   566  deflit(CHUNK_COUNT,2)
   567  forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
   568  	deflit(`disp0', eval(i*CHUNK_COUNT*4))
   569  	deflit(`disp1', eval(disp0 + 4))
   570  	deflit(`disp2', eval(disp1 + 4))
   571  
   572  	movl	disp1(%ebx), %eax
   573  	mull	%ebp
   574  Zdisp(	addl,	%ecx, disp0,(%edi))
   575  	adcl	%eax, %esi
   576  	movl	%edx, %ecx
   577  	jadcl0( %ecx)
   578  
   579  	movl	disp2(%ebx), %eax
   580  	mull	%ebp
   581  	addl	%esi, disp1(%edi)
   582  	adcl	%eax, %ecx
   583  	movl	%edx, %esi
   584  	jadcl0( %esi)
   585  ')
   586  
   587  	decl	VAR_COUNTER
   588  	leal	UNROLL_BYTES(%ebx), %ebx	C advance xp, lea keeps the flags of the decl
   589  
   590  	jns	L(unroll_top)		C loop while the counter is still non-negative
   591  
   592  
   593  	movl	PARAM_YSIZE, %edx
   594  	addl	%ecx, UNROLL_BYTES(%edi)	C add the low carry into wp
   595  
   596  	adcl	$0, %esi		C carry from that add into the high carry
   597  
   598  	incl	%edx			C step the outer counter towards zero
   599  	movl	%esi, UNROLL_BYTES+4(%edi)	C store the high carry as the next limb, flags of the incl kept
   600  
   601  	jnz	L(unroll_outer_top)	C next yp limb until the counter reaches zero
   602  
   603  	C All yp limbs done: reload the saved registers from their stack slots.
   604  	movl	SAVE_ESI, %esi
   605  	movl	SAVE_EBP, %ebp
   606  	movl	SAVE_EDI, %edi
   607  	movl	SAVE_EBX, %ebx
   608  
   609  	addl	$FRAME, %esp		C FRAME is 16+VAR_STACK_SPACE here, dropping the saved registers and VAR_ slots
   610  	ret
   611  
   612  EPILOGUE()