github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/mul_basecase.asm (about)

     1  dnl  Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
     2  
     3  dnl  Copyright 1999-2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
    35  
    36  
    37  dnl  P6 UNROLL_COUNT cycles/product (approx)
    38  dnl           8           7
    39  dnl          16           6.5
    40  dnl          32           6.4
    41  dnl  Maximum possible with the current code is 32.
    42  
    43  deflit(UNROLL_COUNT, 16)
    44  
    45  
    46  C void mpn_mul_basecase (mp_ptr wp,
    47  C                        mp_srcptr xp, mp_size_t xsize,
    48  C                        mp_srcptr yp, mp_size_t ysize);
    49  C
    50  C This routine is essentially the same as mpn/generic/mul_basecase.c, but
    51  C it's faster because it does most of the mpn_addmul_1() startup
    52  C calculations only once.
    53  
    54  ifdef(`PIC',`
    55  deflit(UNROLL_THRESHOLD, 5)
    56  ',`
    57  deflit(UNROLL_THRESHOLD, 5)
    58  ')
    59  
    60  defframe(PARAM_YSIZE,20)
    61  defframe(PARAM_YP,   16)
    62  defframe(PARAM_XSIZE,12)
    63  defframe(PARAM_XP,   8)
    64  defframe(PARAM_WP,   4)
    65  
    66  	TEXT
    67  	ALIGN(16)
    68  
    69  PROLOGUE(mpn_mul_basecase)
    70  deflit(`FRAME',0)
    71  
    72  	movl	PARAM_XSIZE, %ecx
    73  
    74  	movl	PARAM_YP, %eax
    75  
    76  	movl	PARAM_XP, %edx
    77  
    78  	movl	(%eax), %eax		C yp[0]
    79  	cmpl	$2, %ecx
    80  	ja	L(xsize_more_than_two)
    81  	je	L(two_by_something)
    82  
    83  
    84  	C one limb by one limb
    85  
    86  	mull	(%edx)
    87  
    88  	movl	PARAM_WP, %ecx
    89  	movl	%eax, (%ecx)
    90  	movl	%edx, 4(%ecx)
    91  	ret
    92  
    93  
    94  C -----------------------------------------------------------------------------
    95  L(two_by_something):
    96  deflit(`FRAME',0)
    97  
    98  dnl  re-use parameter space
    99  define(SAVE_EBX, `PARAM_XSIZE')
   100  define(SAVE_ESI, `PARAM_YSIZE')
   101  
   102  	movl	%ebx, SAVE_EBX
   103  	cmpl	$1, PARAM_YSIZE
   104  	movl	%eax, %ecx		C yp[0]
   105  
   106  	movl	%esi, SAVE_ESI		C save esi
   107  	movl	PARAM_WP, %ebx
   108  	movl	%edx, %esi		C xp
   109  
   110  	movl	(%edx), %eax		C xp[0]
   111  	jne	L(two_by_two)
   112  
   113  
   114  	C two limbs by one limb
   115  	C
   116  	C eax	xp[0]
   117  	C ebx	wp
   118  	C ecx	yp[0]
   119  	C edx
   120  	C esi	xp
   121  
   122  	mull	%ecx
   123  
   124  	movl	%eax, (%ebx)
   125  	movl	4(%esi), %eax
   126  	movl	%edx, %esi		C carry
   127  
   128  	mull	%ecx
   129  
   130  	addl	%eax, %esi
   131  
   132  	movl	%esi, 4(%ebx)
   133  	movl	SAVE_ESI, %esi
   134  
   135  	adcl	$0, %edx
   136  
   137  	movl	%edx, 8(%ebx)
   138  	movl	SAVE_EBX, %ebx
   139  
   140  	ret
   141  
   142  
   143  
   144  C -----------------------------------------------------------------------------
   145  
   146  	ALIGN(16)
   147  L(two_by_two):
   148  	C eax	xp[0]
   149  	C ebx	wp
   150  	C ecx	yp[0]
   151  	C edx
   152  	C esi	xp
   153  	C edi
   154  	C ebp
   155  
   156  dnl  more parameter space re-use
   157  define(SAVE_EDI, `PARAM_WP')
   158  
   159  	mull	%ecx		C xp[0] * yp[0]
   160  
   161  	movl	%edi, SAVE_EDI
   162  	movl	%edx, %edi	C carry, for wp[1]
   163  
   164  	movl	%eax, (%ebx)
   165  	movl	4(%esi), %eax
   166  
   167  	mull	%ecx		C xp[1] * yp[0]
   168  
   169  	addl	%eax, %edi
   170  	movl	PARAM_YP, %ecx
   171  
   172  	adcl	$0, %edx
   173  	movl	4(%ecx), %ecx	C yp[1]
   174  
   175  	movl	%edi, 4(%ebx)
   176  	movl	4(%esi), %eax	C xp[1]
   177  	movl	%edx, %edi	C carry, for wp[2]
   178  
   179  	mull	%ecx		C xp[1] * yp[1]
   180  
   181  	addl	%eax, %edi
   182  	movl	(%esi), %eax	C xp[0]
   183  
   184  	adcl	$0, %edx
   185  	movl	%edx, %esi	C carry, for wp[3]
   186  
   187  	mull	%ecx		C xp[0] * yp[1]
   188  
   189  	addl	%eax, 4(%ebx)
   190  	movl	%esi, %eax
   191  
   192  	adcl	%edx, %edi
   193  	movl	SAVE_ESI, %esi
   194  
   195  	movl	%edi, 8(%ebx)
   196  
   197  	adcl	$0, %eax
   198  	movl	SAVE_EDI, %edi
   199  
   200  	movl	%eax, 12(%ebx)
   201  	movl	SAVE_EBX, %ebx
   202  
   203  	ret
   204  
   205  
   206  C -----------------------------------------------------------------------------
   207  	ALIGN(16)
   208  L(xsize_more_than_two):
   209  
   210  C The first limb of yp is processed with a simple mpn_mul_1 loop running at
   211  C about 6.2 c/l.  Unrolling this doesn't seem worthwhile since it's only run
   212  C once (whereas the addmul_1 below is run ysize-1 many times).  A call to
   213  C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
   214  C and doesn't seem likely to be worthwhile on the typical sizes reaching
   215  C here from the Karatsuba code.
   216  
   217  	C eax	yp[0]
   218  	C ebx
   219  	C ecx	xsize
   220  	C edx	xp
   221  	C esi
   222  	C edi
   223  	C ebp
   224  
   225  defframe(`SAVE_EBX',    -4)
   226  defframe(`SAVE_ESI',    -8)
   227  defframe(`SAVE_EDI',   -12)
   228  defframe(`SAVE_EBP',   -16)
   229  defframe(VAR_COUNTER,  -20)  dnl for use in the unroll case
   230  defframe(VAR_ADJUST,   -24)
   231  defframe(VAR_JMP,      -28)
   232  defframe(VAR_SWAP,     -32)
   233  defframe(VAR_XP_LOW,   -36)
   234  deflit(STACK_SPACE, 36)
   235  
   236  	subl	$STACK_SPACE, %esp
   237  deflit(`FRAME',STACK_SPACE)
   238  
   239  	movl	%edi, SAVE_EDI
   240  	movl	PARAM_WP, %edi
   241  
   242  	movl	%ebx, SAVE_EBX
   243  
   244  	movl	%ebp, SAVE_EBP
   245  	movl	%eax, %ebp
   246  
   247  	movl	%esi, SAVE_ESI
   248  	xorl	%ebx, %ebx
   249  	leal	(%edx,%ecx,4), %esi	C xp end
   250  
   251  	leal	(%edi,%ecx,4), %edi	C wp end of mul1
   252  	negl	%ecx
   253  
   254  
   255  L(mul1):
   256  	C eax	scratch
   257  	C ebx	carry
   258  	C ecx	counter, negative
   259  	C edx	scratch
   260  	C esi	xp end
   261  	C edi	wp end of mul1
   262  	C ebp	multiplier
   263  
   264  	movl	(%esi,%ecx,4), %eax
   265  
   266  	mull	%ebp
   267  
   268  	addl	%ebx, %eax
   269  	movl	%eax, (%edi,%ecx,4)
   270  	movl	$0, %ebx
   271  
   272  	adcl	%edx, %ebx
   273  	incl	%ecx
   274  	jnz	L(mul1)
   275  
   276  
   277  	movl	PARAM_YSIZE, %edx
   278  
   279  	movl	%ebx, (%edi)		C final carry
   280  	movl	PARAM_XSIZE, %ecx
   281  	decl	%edx
   282  
   283  	jz	L(done)			C if ysize==1
   284  
   285  	cmpl	$UNROLL_THRESHOLD, %ecx
   286  	movl	PARAM_YP, %eax
   287  	jae	L(unroll)
   288  
   289  
   290  C -----------------------------------------------------------------------------
   291  	C simple addmul looping
   292  	C
   293  	C eax	yp
   294  	C ebx
   295  	C ecx	xsize
   296  	C edx	ysize-1
   297  	C esi	xp end
   298  	C edi	wp end of mul1
   299  	C ebp
   300  
   301  	leal	4(%eax,%edx,4), %ebp	C yp end
   302  	negl	%ecx
   303  	negl	%edx
   304  
   305  	movl	%edx, PARAM_YSIZE	C -(ysize-1)
   306  	movl	(%esi,%ecx,4), %eax	C xp low limb
   307  	incl	%ecx
   308  
   309  	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
   310  	xorl	%ebx, %ebx		C initial carry
   311  
   312  	movl	%ebp, PARAM_YP
   313  	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
   314  	jmp	L(simple_outer_entry)
   315  
   316  
   317  L(simple_outer_top):
   318  	C ebp	ysize counter, negative
   319  
   320  	movl	PARAM_YP, %edx
   321  
   322  	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
   323  	xorl	%ebx, %ebx		C carry
   324  
   325  	movl	%ebp, PARAM_YSIZE
   326  	addl	$4, %edi		C next position in wp
   327  
   328  	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
   329  
   330  	movl	-4(%esi,%ecx,4), %eax	C xp low limb
   331  
   332  
   333  L(simple_outer_entry):
   334  
   335  L(simple_inner_top):
   336  	C eax	xp limb
   337  	C ebx	carry limb
   338  	C ecx	loop counter (negative)
   339  	C edx	scratch
   340  	C esi	xp end
   341  	C edi	wp end
   342  	C ebp	multiplier
   343  
   344  	mull	%ebp
   345  
   346  	addl	%eax, %ebx
   347  	adcl	$0, %edx
   348  
   349  	addl	%ebx, (%edi,%ecx,4)
   350  	movl	(%esi,%ecx,4), %eax
   351  	adcl	$0, %edx
   352  
   353  	incl	%ecx
   354  	movl	%edx, %ebx
   355  	jnz	L(simple_inner_top)
   356  
   357  
   358  	C separate code for last limb so outer loop counter handling can be
   359  	C interleaved
   360  
   361  	mull	%ebp
   362  
   363  	movl	PARAM_YSIZE, %ebp
   364  	addl	%eax, %ebx
   365  
   366  	adcl	$0, %edx
   367  
   368  	addl	%ebx, (%edi)
   369  
   370  	adcl	$0, %edx
   371  	incl	%ebp
   372  
   373  	movl	%edx, 4(%edi)
   374  	jnz	L(simple_outer_top)
   375  
   376  
   377  L(done):
   378  	movl	SAVE_EBX, %ebx
   379  
   380  	movl	SAVE_ESI, %esi
   381  
   382  	movl	SAVE_EDI, %edi
   383  
   384  	movl	SAVE_EBP, %ebp
   385  	addl	$FRAME, %esp
   386  
   387  	ret
   388  
   389  
   390  
   391  C -----------------------------------------------------------------------------
   392  C
   393  C The unrolled loop is the same as in mpn_addmul_1, see that code for some
   394  C comments.
   395  C
   396  C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
   397  C increment xp and wp.  This is used to adjust xp and wp, and is rshifted to
   398  C given an initial VAR_COUNTER at the top of the outer loop.
   399  C
   400  C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
   401  C up to -1, inclusive.
   402  C
   403  C VAR_JMP is the computed jump into the unrolled loop.
   404  C
   405  C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
   406  C initial ebx and ecx on entry to the unrolling.
   407  C
   408  C VAR_XP_LOW is the least significant limb of xp, which is needed at the
   409  C start of the unrolled loop.
   410  C
   411  C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
   412  C inclusive.
   413  C
   414  C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
   415  C added to give the location of the next limb of yp, which is the multiplier
   416  C in the unrolled loop.
   417  C
   418  C The trick with the VAR_ADJUST value means it's only necessary to do one
   419  C fetch in the outer loop to take care of xp, wp and the inner loop counter.
   420  
   421  
   422  L(unroll):
   423  	C eax	yp
   424  	C ebx
   425  	C ecx	xsize
   426  	C edx	ysize-1
   427  	C esi	xp end
   428  	C edi	wp end of mul1
   429  	C ebp
   430  
   431  	movl	PARAM_XP, %esi
   432  
   433  	movl	4(%eax), %ebp		C multiplier (yp second limb)
   434  	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
   435  
   436  	movl	%eax, PARAM_YP
   437  	movl	PARAM_WP, %edi
   438  	negl	%edx
   439  
   440  	movl	%edx, PARAM_YSIZE
   441  	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
   442  	decl	%ecx				C xsize-1
   443  
   444  	movl	(%esi), %eax		C xp low limb
   445  	andl	$-UNROLL_MASK-1, %ebx
   446  	negl	%ecx			C -(xsize-1)
   447  
   448  	negl	%ebx
   449  	andl	$UNROLL_MASK, %ecx
   450  
   451  	movl	%ebx, VAR_ADJUST
   452  	movl	%ecx, %edx
   453  	shll	$4, %ecx
   454  
   455  	movl	%eax, VAR_XP_LOW
   456  	sarl	$UNROLL_LOG2, %ebx
   457  	negl	%edx
   458  
   459  	C 15 code bytes per limb
   460  ifdef(`PIC',`
   461  	call	L(pic_calc)
   462  L(unroll_here):
   463  ',`
   464  	leal	L(unroll_inner_entry) (%ecx,%edx,1), %ecx
   465  ')
   466  
   467  	movl	%ecx, VAR_JMP
   468  	movl	%edx, %ecx
   469  	shll	$31, %edx
   470  
   471  	sarl	$31, %edx		C 0 or -1 as xsize odd or even
   472  	leal	4(%edi,%ecx,4), %edi	C wp and xp, adjust for unrolling,
   473  	leal	4(%esi,%ecx,4), %esi	C  and start at second limb
   474  
   475  	movl	%edx, VAR_SWAP
   476  	jmp	L(unroll_outer_entry)
   477  
   478  
   479  ifdef(`PIC',`
   480  L(pic_calc):
   481  	C See mpn/x86/README about old gas bugs
   482  	leal	(%ecx,%edx,1), %ecx
   483  	addl	$L(unroll_inner_entry)-L(unroll_here), %ecx
   484  	addl	(%esp), %ecx
   485  	ret_internal
   486  ')
   487  
   488  
   489  C --------------------------------------------------------------------------
   490  	ALIGN(16)
   491  L(unroll_outer_top):
   492  	C eax
   493  	C ebx
   494  	C ecx
   495  	C edx
   496  	C esi	xp + offset
   497  	C edi	wp + offset
   498  	C ebp	ysize counter, negative
   499  
   500  	movl	VAR_ADJUST, %ebx
   501  	movl	PARAM_YP, %edx
   502  
   503  	movl	VAR_XP_LOW, %eax
   504  	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
   505  
   506  	leal	eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
   507  	leal	(%esi,%ebx,4), %esi
   508  	sarl	$UNROLL_LOG2, %ebx
   509  
   510  	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
   511  
   512  L(unroll_outer_entry):
   513  	mull	%ebp
   514  
   515  	movl	%ebx, VAR_COUNTER
   516  	movl	%edx, %ebx		C carry high
   517  	movl	%eax, %ecx		C carry low
   518  
   519  	xorl	%edx, %eax
   520  	movl	VAR_JMP, %edx
   521  
   522  	andl	VAR_SWAP, %eax
   523  
   524  	xorl	%eax, %ebx		C carries other way for odd index
   525  	xorl	%eax, %ecx
   526  
   527  	jmp	*%edx
   528  
   529  
   530  C -----------------------------------------------------------------------------
   531  
   532  L(unroll_inner_top):
   533  	C eax	xp limb
   534  	C ebx	carry high
   535  	C ecx	carry low
   536  	C edx	scratch
   537  	C esi	xp+8
   538  	C edi	wp
   539  	C ebp	yp multiplier limb
   540  	C
   541  	C VAR_COUNTER  loop counter, negative
   542  	C
   543  	C 15 bytes each limb
   544  
   545  	addl	$UNROLL_BYTES, %edi
   546  
   547  L(unroll_inner_entry):
   548  
   549  deflit(CHUNK_COUNT,2)
   550  forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
   551  	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
   552  	deflit(`disp1', eval(disp0 + 4))
   553  
   554  Zdisp(	movl,	disp0,(%esi), %eax)
   555  	mull	%ebp
   556  Zdisp(	addl,	%ecx, disp0,(%edi))
   557  	adcl	%eax, %ebx		C new carry low
   558  	movl	%edx, %ecx
   559  	adcl	$0, %ecx		C new carry high
   560  
   561  	movl	disp1(%esi), %eax
   562  	mull	%ebp
   563  	addl	%ebx, disp1(%edi)
   564  	adcl	%eax, %ecx		C new carry low
   565  	movl	%edx, %ebx
   566  	adcl	$0, %ebx		C new carry high
   567  ')
   568  
   569  
   570  	incl	VAR_COUNTER
   571  	leal	UNROLL_BYTES(%esi), %esi
   572  	jnz	L(unroll_inner_top)
   573  
   574  
   575  	C eax
   576  	C ebx	carry high
   577  	C ecx	carry low
   578  	C edx
   579  	C esi
   580  	C edi	wp, pointing at second last limb)
   581  	C ebp
   582  
   583  deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
   584  deflit(`disp1', eval(disp0 + 4))
   585  
   586  	movl	PARAM_YSIZE, %ebp
   587  	addl	%ecx, disp0(%edi)	C carry low
   588  
   589  	adcl	$0, %ebx
   590  	incl	%ebp
   591  
   592  	movl	%ebx, disp1(%edi)	C carry high
   593  	jnz	L(unroll_outer_top)
   594  
   595  
   596  	movl	SAVE_ESI, %esi
   597  
   598  	movl	SAVE_EBP, %ebp
   599  
   600  	movl	SAVE_EDI, %edi
   601  
   602  	movl	SAVE_EBX, %ebx
   603  	addl	$FRAME, %esp
   604  
   605  	ret
   606  
   607  EPILOGUE()