github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/sqr_basecase.asm (about)

     1  dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.
     2  
     3  dnl  Copyright 1999-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
    35  C     product (measured on the speed difference between 17 and 33 limbs,
    36  C     which is roughly the Karatsuba recursing range).
    37  
    38  
    39  dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
    40  dnl  code supports.  This value is used only by the tune program to know
    41  dnl  what it can go up to.  (An attempt to compile with a bigger value will
    42  dnl  trigger some m4_assert()s in the code, making the build fail.)
    43  dnl
    44  dnl  The value is determined by requiring the displacements in the unrolled
    45  dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
    46  dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
    47  
    48  deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
    49  
    50  
    51  dnl  Allow a value from the tune program to override config.m4.
    52  
    53  ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
    54  `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
    55  
    56  
    57  dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
    58  dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
    59  dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
    60  dnl
    61  dnl  The first addmul is the biggest, and this takes the second least
    62  dnl  significant limb and multiplies it by the third least significant and
    63  dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
    64  dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
    65  
    66  m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
    67  deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
    68  
    69  
    70  C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
    71  C
    72  C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
    73  C lot of function call overheads are avoided, especially when the given size
    74  C is small.
    75  C
    76  C The code size might look a bit excessive, but not all of it is executed
    77  C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
    78  C clearly apply only to those sizes; mid sizes like 10x10 only need part of
    79  C the unrolled addmul; and big sizes like 35x35 that do need all of it will
    80  C at least be getting value for money, because 35x35 spends something like
    81  C 5780 cycles here.
    82  C
    83  C Different values of UNROLL_COUNT give slightly different speeds, between
    84  C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
    85  C This isn't a big difference, but it's presumably some alignment effect
    86  C which if understood could give a simple speedup.
    87  
    88  defframe(PARAM_SIZE,12)
    89  defframe(PARAM_SRC, 8)
    90  defframe(PARAM_DST, 4)
    91  
    92  	TEXT
    93  	ALIGN(32)
    94  PROLOGUE(mpn_sqr_basecase)
    95  deflit(`FRAME',0)
    96  
    97  	movl	PARAM_SIZE, %ecx
    98  	movl	PARAM_SRC, %eax
    99  
   100  	cmpl	$2, %ecx
   101  	je	L(two_limbs)
   102  
   103  	movl	PARAM_DST, %edx
   104  	ja	L(three_or_more)
   105  
        	C size==1 falls through here (je took size==2, ja took size>2)
   106  
   107  C -----------------------------------------------------------------------------
   108  C one limb only
   109  	C eax	src
   110  	C ebx
   111  	C ecx	size
   112  	C edx	dst
   113  
   114  	movl	(%eax), %eax
   115  	movl	%edx, %ecx	C dst moved to ecx, freeing edx for mull
   116  
   117  	mull	%eax		C edx:eax = src[0]^2
   118  
   119  	movl	%eax, (%ecx)	C dst[0] = low limb
   120  	movl	%edx, 4(%ecx)	C dst[1] = high limb
   121  	ret
   122  
   123  
   124  C -----------------------------------------------------------------------------
   125  	ALIGN(16)
   126  L(two_limbs):
	C dst[0..3] = src[0..1]^2: the two diagonal squares are stored first,
	C then the single cross product src[0]*src[1] is added in twice,
	C since it appears with weight 2 in the square.
	C
	C eax	src
	C ebx
	C ecx	size
	C edx	dst
   131  
   132  	pushl	%ebx
   133  	movl	%eax, %ebx	C src
   134  deflit(`FRAME',4)
   135  
   136  	movl	(%ebx), %eax
   137  	movl	PARAM_DST, %ecx
   138  
   139  	mull	%eax		C src[0]^2
   140  
   141  	movl	%eax, (%ecx)
   142  	movl	4(%ebx), %eax
   143  
   144  	movl	%edx, 4(%ecx)
   145  
   146  	mull	%eax		C src[1]^2
   147  
   148  	movl	%eax, 8(%ecx)
   149  	movl	(%ebx), %eax
   150  
   151  	movl	%edx, 12(%ecx)
   152  	movl	4(%ebx), %edx
   153  
   154  	mull	%edx		C src[0]*src[1]
   155  
        	C first addition of the cross product at dst[1]
   156  	addl	%eax, 4(%ecx)
   157  
   158  	adcl	%edx, 8(%ecx)
   159  	adcl	$0, 12(%ecx)
   160  
        	C second addition of the same cross product (2*src[0]*src[1] total)
   161  	popl	%ebx
   162  	addl	%eax, 4(%ecx)
   163  
   164  	adcl	%edx, 8(%ecx)
   165  	adcl	$0, 12(%ecx)
   166  
   167  	ret
   168  
   169  
   170  C -----------------------------------------------------------------------------
   171  L(three_or_more):
   172  deflit(`FRAME',0)
   173  	cmpl	$4, %ecx
   174  	jae	L(four_or_more)
   175  
   176  
   177  C -----------------------------------------------------------------------------
   178  C three limbs
        C
        C The three diagonal squares are stored to dst first, then the three
        C cross products are accumulated in registers, doubled, and added in.
        C
   179  	C eax	src
   180  	C ecx	size
   181  	C edx	dst
   182  
   183  	pushl	%ebx
   184  	movl	%eax, %ebx	C src
   185  
   186  	movl	(%ebx), %eax
   187  	movl	%edx, %ecx	C dst
   188  
   189  	mull	%eax		C src[0] ^ 2
   190  
   191  	movl	%eax, (%ecx)
   192  	movl	4(%ebx), %eax
   193  
   194  	movl	%edx, 4(%ecx)
   195  	pushl	%esi
   196  
   197  	mull	%eax		C src[1] ^ 2
   198  
   199  	movl	%eax, 8(%ecx)
   200  	movl	8(%ebx), %eax
   201  
   202  	movl	%edx, 12(%ecx)
   203  	pushl	%edi
   204  
   205  	mull	%eax		C src[2] ^ 2
   206  
   207  	movl	%eax, 16(%ecx)
   208  	movl	(%ebx), %eax
   209  
   210  	movl	%edx, 20(%ecx)
   211  	movl	4(%ebx), %edx
   212  
   213  	mull	%edx		C src[0] * src[1]
   214  
   215  	movl	%eax, %esi
   216  	movl	(%ebx), %eax
   217  
   218  	movl	%edx, %edi
   219  	movl	8(%ebx), %edx
   220  
   221  	pushl	%ebp
   222  	xorl	%ebp, %ebp
   223  
   224  	mull	%edx		C src[0] * src[2]
   225  
   226  	addl	%eax, %edi
   227  	movl	4(%ebx), %eax
   228  
   229  	adcl	%edx, %ebp
   230  
   231  	movl	8(%ebx), %edx
   232  
   233  	mull	%edx		C src[1] * src[2]
   234  
   235  	addl	%eax, %ebp
   236  
   237  	adcl	$0, %edx
   238  
   239  
   240  	C eax	will be dst[5]
   241  	C ebx
   242  	C ecx	dst
   243  	C edx	dst[4]
   244  	C esi	dst[1]
   245  	C edi	dst[2]
   246  	C ebp	dst[3]
   247  
        	C double the cross products, the bit shifted out of the top
        	C being collected in eax
   248  	xorl	%eax, %eax
   249  	addl	%esi, %esi
   250  	adcl	%edi, %edi
   251  	adcl	%ebp, %ebp
   252  	adcl	%edx, %edx
   253  	adcl	$0, %eax
   254  
        	C add the doubled cross products onto the diagonal squares
        	C already stored at dst[1..5]
   255  	addl	%esi, 4(%ecx)
   256  	adcl	%edi, 8(%ecx)
   257  	adcl	%ebp, 12(%ecx)
   258  
   259  	popl	%ebp
   260  	popl	%edi
   261  
   262  	adcl	%edx, 16(%ecx)
   263  
   264  	popl	%esi
   265  	popl	%ebx
   266  
   267  	adcl	%eax, 20(%ecx)
   268  	ASSERT(nc)
   269  
   270  	ret
   271  
   272  
   273  C -----------------------------------------------------------------------------
   274  
   275  defframe(SAVE_EBX,   -4)
   276  defframe(SAVE_ESI,   -8)
   277  defframe(SAVE_EDI,   -12)
   278  defframe(SAVE_EBP,   -16)
   279  defframe(VAR_COUNTER,-20)
   280  defframe(VAR_JMP,    -24)
   281  deflit(STACK_SPACE, 24)
   282  
   283  	ALIGN(16)
   284  L(four_or_more):
   285  
   286  	C eax	src
   287  	C ebx
   288  	C ecx	size
   289  	C edx	dst
   290  	C esi
   291  	C edi
   292  	C ebp
   293  
   294  C First multiply src[0]*src[1..size-1] and store at dst[1..size].
   295  C
   296  C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
   297  C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
   298  C a 5780 cycle operation, which is not surprising since the loop here is 8
   299  C c/l and mpn_mul_1 is 6.25 c/l.
   300  
   301  	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
   302  
   303  	movl	%edi, SAVE_EDI
   304  	leal	4(%edx), %edi
   305  
   306  	movl	%ebx, SAVE_EBX
   307  	leal	4(%eax), %ebx
   308  
   309  	movl	%esi, SAVE_ESI
   310  	xorl	%esi, %esi		C initial carry limb = 0
   311  
   312  	movl	%ebp, SAVE_EBP
   313  
   314  	C eax
   315  	C ebx	src+4
   316  	C ecx	size
   317  	C edx
   318  	C esi
   319  	C edi	dst+4
   320  	C ebp
   321  
   322  	movl	(%eax), %ebp	C multiplier
   323  	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary
   324  
   325  
   326  	ALIGN(16)
   327  L(mul_1):
	C dst[i] = low(src[i]*src[0] + carry), esi = new carry, i=1..size-1
	C
   328  	C eax	scratch
   329  	C ebx	src ptr
   330  	C ecx	counter
   331  	C edx	scratch
   332  	C esi	carry
   333  	C edi	dst ptr
   334  	C ebp	multiplier
   335  
   336  	movl	(%ebx), %eax
   337  	addl	$4, %ebx
   338  
   339  	mull	%ebp		C edx:eax = src[i]*src[0]
   340  
   341  	addl	%esi, %eax
   342  	movl	$0, %esi	C movl not xorl, must preserve carry flag
   343  
   344  	adcl	%edx, %esi	C esi = high limb + carry out of the addl
   345  
   346  	movl	%eax, (%edi)
   347  	addl	$4, %edi
   348  
   349  	loop	L(mul_1)
   350  
   351  
   352  C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
   353  C
   354  C The last two addmuls, which are the bottom right corner of the product
   355  C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
   356  C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
   357  C cases that need to be done.
   358  C
   359  C The unrolled code is the same as mpn_addmul_1(), see that routine for some
   360  C comments.
   361  C
   362  C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
   363  C
   364  C VAR_JMP is the computed jump into the unrolled code, stepped by one code
   365  C chunk each outer loop.
   366  C
   367  C K6 doesn't do any branch prediction on indirect jumps, which is good
   368  C actually because it's a different target each time.  The unrolled addmul
   369  C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
   370  C the indirect jump is quickly recovered.
   371  
   372  
   373  dnl  This value is also implicitly encoded in a shift and add.
   374  dnl
   375  deflit(CODE_BYTES_PER_LIMB, 15)
   376  
   377  dnl  With the unmodified &src[size] and &dst[size] pointers, the
   378  dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
   379  dnl  values up to 31.  Above that an offset must be added to them.
   380  dnl
   381  deflit(OFFSET,
   382  ifelse(eval(UNROLL_COUNT>31),1,
   383  eval((UNROLL_COUNT-31)*4),
   384  0))
   385  
   386  	C eax
   387  	C ebx	&src[size]
   388  	C ecx
   389  	C edx
   390  	C esi	carry
   391  	C edi	&dst[size]
   392  	C ebp
   393  
   394  	movl	PARAM_SIZE, %ecx
   395  	movl	%esi, (%edi)		C store final mul_1 carry at dst[size]
   396  
   397  	subl	$4, %ecx
   398  	jz	L(corner)		C size==4 needs only the corner addmuls
   399  
   400  	movl	%ecx, %edx		C size-4, the outer loop count
   401  ifelse(OFFSET,0,,
   402  `	subl	$OFFSET, %ebx')
   403  
   404  	shll	$4, %ecx		C (size-4)*16, close to (size-4)*CODE_BYTES_PER_LIMB
   405  ifelse(OFFSET,0,,
   406  `	subl	$OFFSET, %edi')
   407  
   408  	negl	%ecx
   409  
   410  ifdef(`PIC',`
   411  	call	L(pic_calc)
   412  L(here):
   413  ',`
   414  	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
   415  ')
   416  	negl	%edx			C counter, -(size-4)..-1
   417  
   418  
   419  	C The calculated jump mustn't be before the start of the available
   420  	C code.  This is the limitation UNROLL_COUNT puts on the src operand
   421  	C size, but checked here using the jump address directly.
   422  	C
   423  	ASSERT(ae,`
   424  	movl_text_address( L(unroll_inner_start), %eax)
   425  	cmpl	%eax, %ecx
   426  	')
   427  
   428  
   429  C -----------------------------------------------------------------------------
   430  	ALIGN(16)
   431  L(unroll_outer_top):
   432  	C eax
   433  	C ebx	&src[size], constant
   434  	C ecx	VAR_JMP
   435  	C edx	VAR_COUNTER, limbs, negative
   436  	C esi	high limb to store
   437  	C edi	dst ptr, high of last addmul
   438  	C ebp
   439  
   440  	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
   441  	movl	%edx, VAR_COUNTER
   442  
   443  	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand
   444  
   445  	mull	%ebp
   446  
   447  	testb	$1, %cl		C low bit of the jump address picks the
        				C ecx/esi carry ordering for this entry point
   448  
   449  	movl	%edx, %esi	C high carry
   450  	movl	%ecx, %edx	C jump
   451  
   452  	movl	%eax, %ecx	C low carry
   453  	leal	CODE_BYTES_PER_LIMB(%edx), %edx	C step jump one chunk forward
   454  
   455  	movl	%edx, VAR_JMP
   456  	leal	4(%edi), %edi
   457  
   458  	C A branch-free version of this using some xors was found to be a
   459  	C touch slower than just a conditional jump, despite the jump
   460  	C switching between taken and not taken on every loop.
   461  
   462  ifelse(eval(UNROLL_COUNT%2),0,
   463  	jz,jnz)	L(unroll_noswap)
   464  	movl	%esi, %eax	C high,low carry other way around
   465  
   466  	movl	%ecx, %esi
   467  	movl	%eax, %ecx
   468  L(unroll_noswap):
   469  
   470  	jmp	*%edx		C computed jump into the unrolled addmul
   471  
   471  
   472  
   473  	C Must be on an even address here so the low bit of the jump address
   474  	C will indicate which way around ecx/esi should start.
   475  	C
   476  	C An attempt was made at padding here to get the end of the unrolled
   477  	C code to come out on a good alignment, to save padding before
   478  	C L(corner).  This worked, but turned out to run slower than just an
   479  	C ALIGN(2).  The reason for this is not clear, it might be related
   480  	C to the different speeds on different UNROLL_COUNTs noted above.
   481  
   482  	ALIGN(2)
   483  
   484  L(unroll_inner_start):
	C Unrolled addmul chunks; the computed jump enters partway through,
	C executing only as many chunks as the current multiplicand needs.
	C
   485  	C eax	scratch
   486  	C ebx	src
   487  	C ecx	carry low
   488  	C edx	scratch
   489  	C esi	carry high
   490  	C edi	dst
   491  	C ebp	multiplier
   492  	C
   493  	C 15 code bytes each limb
   494  	C ecx/esi swapped on each chunk
   495  
   496  forloop(`i', UNROLL_COUNT, 1, `
   497  	deflit(`disp_src', eval(-i*4 + OFFSET))
   498  	deflit(`disp_dst', eval(disp_src - 4))
   499  
   500  	m4_assert(`disp_src>=-128 && disp_src<128')
   501  	m4_assert(`disp_dst>=-128 && disp_dst<128')
   502  
   503  ifelse(eval(i%2),0,`
   504  Zdisp(	movl,	disp_src,(%ebx), %eax)
   505  	mull	%ebp
   506  Zdisp(	addl,	%esi, disp_dst,(%edi))
   507  	adcl	%eax, %ecx
   508  	movl	%edx, %esi
   509  	jadcl0( %esi)
   510  ',`
   511  	dnl  this one comes out last
   512  Zdisp(	movl,	disp_src,(%ebx), %eax)
   513  	mull	%ebp
   514  Zdisp(	addl,	%ecx, disp_dst,(%edi))
   515  	adcl	%eax, %esi
   516  	movl	%edx, %ecx
   517  	jadcl0( %ecx)
   518  ')
   519  ')
L(unroll_inner_end):

        	C add the low carry into the last limb touched, then store the
        	C remaining high carry as the new top limb of this addmul row
   522  	addl	%esi, -4+OFFSET(%edi)
   523  
   524  	movl	VAR_COUNTER, %edx
   525  	jadcl0(	%ecx)
   526  
   527  	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
   528  	movl	VAR_JMP, %ecx
   529  
   530  	incl	%edx
   531  	jnz	L(unroll_outer_top)
   532  
   533  
   534  ifelse(OFFSET,0,,`
   535  	addl	$OFFSET, %ebx		C restore unmodified &src[size], &dst[...]
   536  	addl	$OFFSET, %edi
   537  ')
   538  
   539  
   540  C -----------------------------------------------------------------------------
   541  	ALIGN(16)
   542  L(corner):
	C Bottom right corner of the product triangle: the last three cross
	C products, src[size-3]*src[size-2], src[size-3]*src[size-1] and
	C src[size-2]*src[size-1].
	C
   543  	C ebx	&src[size]
   544  	C edi	&dst[2*size-5]
   545  
   546  	movl	-12(%ebx), %ebp		C ebp = src[size-3], multiplier
   547  
   548  	movl	-8(%ebx), %eax
   549  	movl	%eax, %ecx		C ecx = src[size-2], kept for the last mull
   550  
   551  	mull	%ebp			C src[size-3]*src[size-2]
   552  
   553  	addl	%eax, -4(%edi)
   554  	adcl	$0, %edx
   555  
   556  	movl	-4(%ebx), %eax
   557  	movl	%edx, %esi		C carry
   558  	movl	%eax, %ebx		C ebx = src[size-1], kept for the last mull
   559  
   560  	mull	%ebp			C src[size-3]*src[size-1]
   561  
   562  	addl	%esi, %eax
   563  	adcl	$0, %edx
   564  
   565  	addl	%eax, (%edi)
   566  	adcl	$0, %edx
   567  
   568  	movl	%edx, %esi		C carry
   569  	movl	%ebx, %eax
   570  
   571  	mull	%ecx			C src[size-2]*src[size-1]
   572  
   573  	addl	%esi, %eax
   574  	movl	%eax, 4(%edi)
   575  
   576  	adcl	$0, %edx
   577  
   578  	movl	%edx, 8(%edi)		C high limb of the last product
   579  
   580  
   581  C -----------------------------------------------------------------------------
   582  C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
   583  C The loop measures about 6 cycles/iteration, though it looks like it should
   584  C decode in 5.
   585  
   586  L(lshift_start):
   587  	movl	PARAM_SIZE, %ecx
   588  
   589  	movl	PARAM_DST, %edi
   590  	subl	$1, %ecx		C size-1 and clear carry
   591  
   592  	movl	PARAM_SRC, %ebx
   593  	movl	%ecx, %edx
   594  
   595  	xorl	%eax, %eax		C ready for adcl
   596  
   597  
   598  	ALIGN(16)
   599  L(lshift):
	C Two limbs per iteration; dst[1..2*size-2] is 2*(size-1) limbs,
	C always an even count, and the rcl chain carries the shifted-out
	C bit from one limb to the next.
	C
   600  	C eax
   601  	C ebx	src (for later use)
   602  	C ecx	counter, decrementing
   603  	C edx	size-1 (for later use)
   604  	C esi
   605  	C edi	dst, incrementing
   606  	C ebp
   607  
   608  	rcll	4(%edi)
   609  	rcll	8(%edi)
   610  	leal	8(%edi), %edi
   611  	loop	L(lshift)
   612  
   613  
   614  	adcl	%eax, %eax		C final bit shifted out
   615  
   616  	movl	%eax, 4(%edi)		C dst most significant limb
   617  	movl	(%ebx), %eax		C src[0]
   618  
   619  	leal	4(%ebx,%edx,4), %ebx	C &src[size]
   620  	subl	%edx, %ecx		C -(size-1)
   621  
   622  
   623  C -----------------------------------------------------------------------------
   624  C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
   625  C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
   626  C low limb of src[0]^2.
   627  
   628  
   629  	mull	%eax			C src[0]^2
   630  
   631  	movl	%eax, (%edi,%ecx,8)	C dst[0]
   632  
   633  
   634  	ALIGN(16)
   635  L(diag):
   636  	C eax	scratch
   637  	C ebx	&src[size]
   638  	C ecx	counter, negative
   639  	C edx	carry
   640  	C esi	scratch
   641  	C edi	dst[2*size-2]
   642  	C ebp
   643  
   644  	movl	(%ebx,%ecx,4), %eax
   645  	movl	%edx, %esi		C carry from the previous square
   646  
   647  	mull	%eax			C src[i]^2
   648  
   649  	addl	%esi, 4(%edi,%ecx,8)
   650  	adcl	%eax, 8(%edi,%ecx,8)
   651  	adcl	$0, %edx
   652  
   653  	incl	%ecx
   654  	jnz	L(diag)
   655  
   656  
   657  	movl	SAVE_EBX, %ebx
   658  	movl	SAVE_ESI, %esi
   659  
   660  	addl	%edx, 4(%edi)		C dst most significant limb
   661  
   662  	movl	SAVE_EDI, %edi
   663  	movl	SAVE_EBP, %ebp
   664  	addl	$FRAME, %esp
   665  	ret
   666  
   667  
   668  
   669  C -----------------------------------------------------------------------------
   670  ifdef(`PIC',`
   671  L(pic_calc):
	C Compute the same jump target as the non-PIC leal above, but
	C position-independently: ecx += return address (= L(here))
	C + displacement to the unrolled code + edx.
   672  	C See mpn/x86/README about old gas bugs
   673  	addl	(%esp), %ecx
   674  	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
   675  	addl	%edx, %ecx
   676  	ret_internal
   677  ')
   678  
   679  
   680  EPILOGUE()