github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mul_basecase.asm (about)

     1  dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
     2  
     3  dnl  Copyright 1999-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
    35  C     limbs/loop unrolling, see the UNROLL_COUNT table below).
    36  
    37  
    38  
    39  dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
    40  dnl           8           4.67
    41  dnl          16           4.59
    42  dnl          32           4.42
    43  dnl  Maximum possible with the current code is 32.
    44  dnl
    45  dnl  At 32 the typical 13-26 limb sizes from the karatsuba code will get
    46  dnl  done with a straight run through a block of code, no inner loop.  Using
    47  dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
    48  
    49  deflit(UNROLL_COUNT, 32)	C limbs handled per pass of the unrolled addmul loop (sets the forloop trip count)
    50  
    51  
    52  C void mpn_mul_basecase (mp_ptr wp,
    53  C                        mp_srcptr xp, mp_size_t xsize,
    54  C                        mp_srcptr yp, mp_size_t ysize);
    55  C
    56  C Calculate xp,xsize multiplied by yp,ysize, storing the result in
    57  C wp,xsize+ysize.
    58  C
    59  C This routine is essentially the same as mpn/generic/mul_basecase.c, but
    60  C it's faster because it does most of the mpn_addmul_1() startup
    61  C calculations only once.  The saving is 15-25% on typical sizes coming from
    62  C the Karatsuba multiply code.
    63  
    64  ifdef(`PIC',`
    65  deflit(UNROLL_THRESHOLD, 5)
    66  ',`
    67  deflit(UNROLL_THRESHOLD, 5)
    68  ')	C xsize at or above this value takes the unrolled loop; PIC and non-PIC currently both use 5
    69  C Argument slots in prototype order.  NOTE(review): offsets presumably relative to esp at entry - confirm in defframe
    70  defframe(PARAM_YSIZE,20)	C ysize
    71  defframe(PARAM_YP,   16)	C yp
    72  defframe(PARAM_XSIZE,12)	C xsize
    73  defframe(PARAM_XP,   8)	C xp
    74  defframe(PARAM_WP,   4)	C wp
    75  
    76  	TEXT
    77  	ALIGN(32)
    78  PROLOGUE(mpn_mul_basecase)
    79  deflit(`FRAME',0)
    80  	C Entry: fetch xsize, xp and yp[0], then dispatch on xsize.
    81  	movl	PARAM_XSIZE, %ecx
    82  	movl	PARAM_YP, %eax
    83  
    84  	movl	PARAM_XP, %edx
    85  	movl	(%eax), %eax	C yp low limb
    86  
    87  	cmpl	$2, %ecx
    88  	ja	L(xsize_more_than_two)	C xsize above 2 (unsigned compare)
    89  	je	L(two_by_something)	C xsize exactly 2
    90  
    91  
    92  	C one limb by one limb (fall through when xsize is below 2; ysize is not
    93  	C examined on this path - presumably callers guarantee ysize is 1, TODO confirm)
    94  	mull	(%edx)		C edx:eax = yp[0] * xp[0]
    95  	C Only eax, ecx and edx are used on this path, so nothing is saved.
    96  	movl	PARAM_WP, %ecx
    97  	movl	%eax, (%ecx)	C wp[0] = low half
    98  	movl	%edx, 4(%ecx)	C wp[1] = high half
    99  	ret
   100  
   101  
   102  C -----------------------------------------------------------------------------
   103  L(two_by_something):
   104  deflit(`FRAME',0)
   105  	decl	PARAM_YSIZE	C ysize-1 written back to the argument slot; its ZF is consumed by the jnz below
   106  	pushl	%ebx		defframe_pushl(`SAVE_EBX')
   107  	movl	%eax, %ecx	C yp low limb
   108  	C pushl and movl leave the flags alone, so ZF from the decl survives to the jnz
   109  	movl	PARAM_WP, %ebx
   110  	pushl	%esi		defframe_pushl(`SAVE_ESI')
   111  	movl	%edx, %esi	C xp
   112  
   113  	movl	(%edx), %eax	C xp low limb
   114  	jnz	L(two_by_two)	C ysize-1 nonzero (presumably ysize is 2 here, since xsize is 2 - confirm)
   115  
   116  
   117  	C two limbs by one limb
   118  
   119  	mull	%ecx		C edx:eax = xp[0] * yp[0]
   120  
   121  	movl	%eax, (%ebx)	C wp[0]
   122  	movl	4(%esi), %eax	C xp[1]
   123  	movl	%edx, %esi	C carry
   124  
   125  	mull	%ecx		C edx:eax = xp[1] * yp[0]
   126  
   127  	addl	%eax, %esi	C low half + carry, CF feeds the adcl below
   128  
   129  	movl	%esi, 4(%ebx)	C wp[1]
   130  	movl	SAVE_ESI, %esi	C restore esi (movl keeps CF)
   131  
   132  	adcl	$0, %edx
   133  
   134  	movl	%edx, 8(%ebx)	C wp[2]
   135  	movl	SAVE_EBX, %ebx
   136  	addl	$FRAME, %esp	C release the two pushed save slots
   137  
   138  	ret
   139  
   140  
   141  
   142  C -----------------------------------------------------------------------------
   143  C Could load yp earlier into another register.
   144  
   145  	ALIGN(16)
   146  L(two_by_two):
   147  	C eax	xp low limb
   148  	C ebx	wp
   149  	C ecx	yp low limb
   150  	C edx
   151  	C esi	xp
   152  	C edi
   153  	C ebp
   154  	C Two limbs by two limbs: four products accumulated into wp[0] to wp[3].
   155  dnl  FRAME carries on from previous
   156  
   157  	mull	%ecx		C xp[0] * yp[0]
   158  
   159  	push	%edi		defframe_pushl(`SAVE_EDI')
   160  	movl	%edx, %edi	C carry, for wp[1]
   161  
   162  	movl	%eax, (%ebx)	C wp[0]
   163  	movl	4(%esi), %eax
   164  
   165  	mull	%ecx		C xp[1] * yp[0]
   166  
   167  	addl	%eax, %edi
   168  	movl	PARAM_YP, %ecx	C (movl keeps CF for the adcl below)
   169  
   170  	adcl	$0, %edx
   171  	movl	4(%ecx), %ecx	C yp[1]
   172  	movl	%edi, 4(%ebx)	C wp[1], partial; xp[0]*yp[1] is added in later
   173  
   174  	movl	4(%esi), %eax	C xp[1]
   175  	movl	%edx, %edi	C carry, for wp[2]
   176  
   177  	mull	%ecx		C xp[1] * yp[1]
   178  
   179  	addl	%eax, %edi
   180  
   181  	adcl	$0, %edx
   182  	movl	(%esi), %eax	C xp[0]
   183  
   184  	movl	%edx, %esi	C carry, for wp[3]
   185  
   186  	mull	%ecx		C xp[0] * yp[1]
   187  
   188  	addl	%eax, 4(%ebx)	C low half into wp[1]
   189  	adcl	%edx, %edi	C high half plus carry into the wp[2] value
   190  	movl	%edi, 8(%ebx)	C wp[2]
   191  
   192  	adcl	$0, %esi	C carry from the wp[2] sum (movl above keeps CF)
   193  	movl	SAVE_EDI, %edi
   194  	movl	%esi, 12(%ebx)	C wp[3]
   195  
   196  	movl	SAVE_ESI, %esi
   197  	movl	SAVE_EBX, %ebx
   198  	addl	$FRAME, %esp	C release the three pushed save slots
   199  
   200  	ret
   201  
   202  
   203  C -----------------------------------------------------------------------------
   204  	ALIGN(16)
   205  L(xsize_more_than_two):
   206  
   207  C The first limb of yp is processed with a simple mpn_mul_1 style loop
   208  C inline.  Unrolling this doesn't seem worthwhile since it's only run once
   209  C (whereas the addmul below is run ysize-1 many times).  A call to the
   210  C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
   211  C popping, and doesn't seem likely to be worthwhile on the typical 13-26
   212  C limb operations the Karatsuba code calls here with.
   213  
   214  	C eax	yp[0]
   215  	C ebx
   216  	C ecx	xsize
   217  	C edx	xp
   218  	C esi
   219  	C edi
   220  	C ebp
   221  
   222  dnl  FRAME doesn't carry on from previous, no pushes yet here
   223  defframe(`SAVE_EBX',-4)
   224  defframe(`SAVE_ESI',-8)
   225  defframe(`SAVE_EDI',-12)
   226  defframe(`SAVE_EBP',-16)
   227  deflit(`FRAME',0)
   228  
   229  	subl	$16, %esp	C room for the four SAVE_ slots defined above
   230  deflit(`FRAME',16)
   231  
   232  	movl	%edi, SAVE_EDI
   233  	movl	PARAM_WP, %edi
   234  
   235  	movl	%ebx, SAVE_EBX
   236  	movl	%ebp, SAVE_EBP
   237  	movl	%eax, %ebp	C multiplier, yp[0]
   238  
   239  	movl	%esi, SAVE_ESI
   240  	xorl	%ebx, %ebx	C carry limb starts at zero
   241  	leal	(%edx,%ecx,4), %esi	C xp end
   242  
   243  	leal	(%edi,%ecx,4), %edi	C wp end of mul1
   244  	negl	%ecx		C -xsize, counts up to zero as an index from the end pointers
   245  
   246  
   247  L(mul1):
   248  	C eax	scratch
   249  	C ebx	carry
   250  	C ecx	counter, negative
   251  	C edx	scratch
   252  	C esi	xp end
   253  	C edi	wp end of mul1
   254  	C ebp	multiplier
   255  
   256  	movl	(%esi,%ecx,4), %eax	C next xp limb
   257  
   258  	mull	%ebp			C edx:eax = xp limb * yp[0]
   259  
   260  	addl	%ebx, %eax		C add carry-in to the low half
   261  	movl	%eax, (%edi,%ecx,4)	C store wp limb (overwrite, not accumulate)
   262  	movl	$0, %ebx		C movl rather than xorl, so CF from the addl survives
   263  
   264  	adcl	%edx, %ebx		C next carry = high half + CF
   265  	incl	%ecx
   266  	jnz	L(mul1)
   267  
   268  
   269  	movl	PARAM_YSIZE, %edx
   270  	movl	PARAM_XSIZE, %ecx
   271  
   272  	movl	%ebx, (%edi)		C final carry
   273  	decl	%edx			C ysize-1, number of yp limbs still to process
   274  
   275  	jnz	L(ysize_more_than_one)
   276  
   277  	C ysize is 1: the product is complete, restore registers and return
   278  	movl	SAVE_EDI, %edi
   279  	movl	SAVE_EBX, %ebx
   280  
   281  	movl	SAVE_EBP, %ebp
   282  	movl	SAVE_ESI, %esi
   283  	addl	$FRAME, %esp		C release the 16 bytes of save slots
   284  
   285  	ret
   286  
   287  
   288  L(ysize_more_than_one):
   289  	cmpl	$UNROLL_THRESHOLD, %ecx	C xsize against the unrolling threshold
   290  	movl	PARAM_YP, %eax		C (movl leaves the compare flags intact)
   291  
   292  	jae	L(unroll)		C unsigned xsize at or above the threshold
   293  
   294  
   295  C -----------------------------------------------------------------------------
   296  	C simple addmul looping
   297  	C
   298  	C eax	yp
   299  	C ebx
   300  	C ecx	xsize
   301  	C edx	ysize-1
   302  	C esi	xp end
   303  	C edi	wp end of mul1
   304  	C ebp
   305  	C The argument slots are reused below to hold the loop counters and yp end.
   306  	leal	4(%eax,%edx,4), %ebp	C yp end
   307  	negl	%ecx
   308  	negl	%edx
   309  
   310  	movl	(%esi,%ecx,4), %eax	C xp low limb
   311  	movl	%edx, PARAM_YSIZE	C -(ysize-1)
   312  	incl	%ecx
   313  
   314  	xorl	%ebx, %ebx		C initial carry
   315  	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
   316  	movl	%ebp, PARAM_YP		C yp end, later indexed with the negative ysize counter
   317  
   318  	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
   319  	jmp	L(simple_outer_entry)
   320  
   321  
   322  	C this is offset 0x121 so close enough to aligned
   323  L(simple_outer_top):
   324  	C ebp	ysize counter, negative
   325  
   326  	movl	PARAM_YP, %edx
   327  	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
   328  	xorl	%ebx, %ebx		C carry
   329  
   330  	movl	%ebp, PARAM_YSIZE
   331  	addl	$4, %edi		C next position in wp
   332  
   333  	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
   334  	movl	-4(%esi,%ecx,4), %eax	C xp low limb
   335  
   336  
   337  L(simple_outer_entry):
   338  
   339  L(simple_inner):
   340  	C eax	xp limb
   341  	C ebx	carry limb
   342  	C ecx	loop counter (negative)
   343  	C edx	scratch
   344  	C esi	xp end
   345  	C edi	wp end
   346  	C ebp	multiplier
   347  
   348  	mull	%ebp			C edx:eax = xp limb * multiplier
   349  
   350  	addl	%eax, %ebx		C carry-in + low half
   351  	adcl	$0, %edx
   352  
   353  	addl	%ebx, (%edi,%ecx,4)	C accumulate into wp
   354  	movl	(%esi,%ecx,4), %eax	C fetch the next xp limb (movl keeps CF)
   355  	adcl	$0, %edx		C carry out of the wp addition
   356  
   357  	incl	%ecx
   358  	movl	%edx, %ebx		C high half becomes the next carry-in
   359  	jnz	L(simple_inner)		C ZF from the incl; the movl does not change flags
   360  
   361  
   362  	mull	%ebp			C last xp limb, fetched by the final loop pass
   363  
   364  	movl	PARAM_YSIZE, %ebp	C reload the outer counter, multiplier no longer needed
   365  	addl	%eax, %ebx
   366  
   367  	adcl	$0, %edx
   368  	addl	%ebx, (%edi)
   369  
   370  	adcl	$0, %edx
   371  	incl	%ebp			C ZF set when the last yp limb has been processed
   372  
   373  	movl	%edx, 4(%edi)		C new top limb is stored, not accumulated
   374  	jnz	L(simple_outer_top)
   375  
   376  
   377  	movl	SAVE_EBX, %ebx
   378  	movl	SAVE_ESI, %esi
   379  
   380  	movl	SAVE_EDI, %edi
   381  	movl	SAVE_EBP, %ebp
   382  	addl	$FRAME, %esp		C release the 16 bytes of save slots
   383  
   384  	ret
   385  
   386  
   387  
   388  C -----------------------------------------------------------------------------
   389  C
   390  C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
   391  C comments.
   392  C
   393  C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
   394  C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
   395  C to give an initial VAR_COUNTER at the top of the outer loop.
   396  C
   397  C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
   398  C up to -1, inclusive.
   399  C
   400  C VAR_JMP is the computed jump into the unrolled loop.
   401  C
   402  C VAR_XP_LOW is the least significant limb of xp, which is needed at the
   403  C start of the unrolled loop.
   404  C
   405  C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
   406  C inclusive.
   407  C
   408  C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
   409  C added to give the location of the next limb of yp, which is the multiplier
   410  C in the unrolled loop.
   411  C
   412  C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
   413  C outer loop to take care of xp, wp and the inner loop counter.
   414  
   415  defframe(VAR_COUNTER,  -20)
   416  defframe(VAR_ADJUST,   -24)
   417  defframe(VAR_JMP,      -28)
   418  defframe(VAR_XP_LOW,   -32)
   419  deflit(VAR_EXTRA_SPACE, 16)
   420  	C The four VAR_ slots sit directly below the SAVE_ slots; VAR_EXTRA_SPACE is their total size.
   421  
   422  L(unroll):
   423  	C eax	yp
   424  	C ebx
   425  	C ecx	xsize
   426  	C edx	ysize-1
   427  	C esi	xp end
   428  	C edi	wp end of mul1
   429  	C ebp
   430  	C One-time setup for the unrolled addmul; esi and edi are reloaded from the arguments.
   431  	movl	PARAM_XP, %esi
   432  	movl	4(%eax), %ebp		C multiplier (yp second limb)
   433  	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
   434  
   435  	movl	PARAM_WP, %edi
   436  	movl	%eax, PARAM_YP
   437  	negl	%edx
   438  
   439  	movl	%edx, PARAM_YSIZE	C -(ysize-1), the outer loop counter
   440  	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
   441  	decl	%ecx				C xsize-1
   442  
   443  	movl	(%esi), %eax		C xp low limb
   444  	andl	$-UNROLL_MASK-1, %ebx	C round down (mask presumably UNROLL_COUNT-1, defined elsewhere - confirm)
   445  	negl	%ecx
   446  
   447  	subl	$VAR_EXTRA_SPACE, %esp
   448  deflit(`FRAME',16+VAR_EXTRA_SPACE)
   449  	negl	%ebx			C negative limb count covered by whole passes of the unrolled code
   450  	andl	$UNROLL_MASK, %ecx	C limbs of the unrolled code to skip on entry
   451  
   452  	movl	%ebx, VAR_ADJUST
   453  	movl	%ecx, %edx
   454  	shll	$4, %ecx		C 16*skip; adding edx below gives 17*skip
   455  
   456  	sarl	$UNROLL_LOG2, %ebx	C initial VAR_COUNTER value (stored at the outer entry)
   457  
   458  	C 17 code bytes per limb
   459  ifdef(`PIC',`
   460  	call	L(pic_calc)
   461  L(unroll_here):
   462  ',`
   463  	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
   464  ')
   465  	negl	%edx			C -skip, used to bias wp and xp below
   466  
   467  	movl	%eax, VAR_XP_LOW
   468  	movl	%ecx, VAR_JMP		C computed entry point into the unrolled code
   469  	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
   470  	leal	4(%esi,%edx,4), %esi	C  and start at second limb
   471  	jmp	L(unroll_outer_entry)
   472  
   473  
   474  ifdef(`PIC',`
   475  L(pic_calc):
   476  	C See mpn/x86/README about old gas bugs
   477  	leal	(%ecx,%edx,1), %ecx	C 17*skip
   478  	addl	$L(unroll_entry)-L(unroll_here), %ecx	C plus the distance from unroll_here to unroll_entry
   479  	addl	(%esp), %ecx		C plus the return address pushed by the call, which is unroll_here
   480  	ret_internal
   481  ')
   482  
   483  
   484  C --------------------------------------------------------------------------
   485  	ALIGN(32)
   486  L(unroll_outer_top):
   487  	C ebp	ysize counter, negative
   488  	C Per yp limb: rewind xp and wp, step wp on one limb, fetch the next multiplier.
   489  	movl	VAR_ADJUST, %ebx
   490  	movl	PARAM_YP, %edx
   491  
   492  	movl	VAR_XP_LOW, %eax
   493  	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
   494  
   495  	leal	4(%edi,%ebx,4), %edi	C undo the inner loop advance of wp, then move on one limb
   496  	leal	(%esi,%ebx,4), %esi	C undo the inner loop advance of xp
   497  	sarl	$UNROLL_LOG2, %ebx	C VAR_ADJUST shifted down gives the initial VAR_COUNTER
   498  
   499  	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
   500  	movl	VAR_JMP, %ecx		C only the low bit is used, by the testb below
   501  
   502  L(unroll_outer_entry):
   503  	mull	%ebp		C edx:eax = xp low limb * multiplier
   504  	C ecx holds the VAR_JMP value on both paths to here; its low bit picks the register for eax
   505  	testb	$1, %cl		C and clear carry bit
   506  	movl	%ebx, VAR_COUNTER
   507  	movl	$0, %ebx	C movl not xorl, so ZF and CF from the testb survive
   508  
   509  	movl	$0, %ecx
   510  	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
   511  	cmovnz(	%eax, %ebx)	C NOTE(review): presumably the odd-address entry case, where ebx and ecx swap roles - confirm
   512  
   513  	C Extra fetch of VAR_JMP is bad, but registers are tight
   514  	jmp	*VAR_JMP
   515  
   516  
   517  C -----------------------------------------------------------------------------
   518  	ALIGN(32)
   519  L(unroll_top):
   520  	C eax	xp limb
   521  	C ebx	carry high
   522  	C ecx	carry low
   523  	C edx	scratch
   524  	C esi	xp+8
   525  	C edi	wp
   526  	C ebp	yp multiplier limb
   527  	C
   528  	C VAR_COUNTER  loop counter, negative
   529  	C
   530  	C 17 bytes each limb
   531  	C The forloop below emits UNROLL_COUNT/CHUNK_COUNT chunks of two limbs; ebx and ecx swap roles each limb.
   532  L(unroll_entry):
   533  
   534  deflit(CHUNK_COUNT,2)
   535  forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
   536  	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
   537  	deflit(`disp1', eval(disp0 + 4))
   538  
   539  Zdisp(	movl,	disp0,(%esi), %eax)
   540  	adcl	%edx, %ebx	C previous high half plus CF into the pending carry
   541  
   542  	mull	%ebp
   543  
   544  Zdisp(	addl,	%ecx, disp0,(%edi))
   545  	movl	$0, %ecx	C movl keeps CF from the addl for the adcl below
   546  
   547  	adcl	%eax, %ebx	C new low half joins the pending carry
   548  
   549  
   550  	movl	disp1(%esi), %eax
   551  	adcl	%edx, %ecx	C second limb of the chunk: same steps with ebx and ecx exchanged
   552  
   553  	mull	%ebp
   554  
   555  	addl	%ebx, disp1(%edi)
   556  	movl	$0, %ebx
   557  
   558  	adcl	%eax, %ecx
   559  ')
   560  
   561  
   562  	incl	VAR_COUNTER	C sets ZF for the jnz below and leaves CF alone
   563  	leal	UNROLL_BYTES(%esi), %esi	C leal advances the pointers without touching flags
   564  	leal	UNROLL_BYTES(%edi), %edi
   565  
   566  	jnz	L(unroll_top)
   567  
   568  
   569  	C eax
   570  	C ebx	zero
   571  	C ecx	low
   572  	C edx	high
   573  	C esi
   574  	C edi	wp, pointing at second last limb
   575  	C ebp
   576  	C
   577  	C carry flag to be added to high
   578  
   579  deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
   580  deflit(`disp1', eval(disp0-0 + 4))	C the -0 presumably keeps eval valid when disp0 expands to nothing
   581  
   582  	movl	PARAM_YSIZE, %ebp	C reload the outer counter (movl keeps CF)
   583  	adcl	$0, %edx		C pending CF from the last adcl of the loop
   584  	addl	%ecx, disp0(%edi)	C low carry into the second last limb
   585  
   586  	adcl	$0, %edx
   587  	incl	%ebp			C ZF set when the last yp limb has been processed
   588  
   589  	movl	%edx, disp1(%edi)	C new top limb is stored, not accumulated
   590  	jnz	L(unroll_outer_top)
   591  
   592  
   593  	movl	SAVE_ESI, %esi
   594  	movl	SAVE_EBP, %ebp
   595  
   596  	movl	SAVE_EDI, %edi
   597  	movl	SAVE_EBX, %ebx
   598  	addl	$FRAME, %esp		C release the save slots and the VAR_ slots together
   599  
   600  	ret
   601  
   602  EPILOGUE()
   601  
   602  EPILOGUE()