github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/sqr_basecase.asm (about)

     1  dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.
     2  
     3  dnl  Copyright 1999-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
     31  include(`../config.m4')
     32  
     33  
     34  C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
     35  C     (measured on the speed difference between 25 and 50 limbs, which is
     36  C     roughly the Karatsuba recursing range).
     37  
     38  
     39  dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
     40  dnl  some comments.
     41  
dnl  Upper limit on SQR_TOOM2_THRESHOLD that this unrolled code supports.
     42  deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
     43  
     44  ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
     45  `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
     46  
     47  m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
dnl  Number of code chunks emitted by the unrolled inner loop below
dnl  (see the forloop at L(unroll_inner_start)).
     48  deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
     49  
     50  
     51  C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
     52  C
     53  C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
     54  C which is quite a bit, but is considered good value since squares big
     55  C enough to use most of the code will be spending quite a few cycles in it.
     56  
     57  
dnl  Byte offsets of the three stack parameters (32-bit cdecl).
     58  defframe(PARAM_SIZE,12)
     59  defframe(PARAM_SRC, 8)
     60  defframe(PARAM_DST, 4)
     61  
     62  	TEXT
     63  	ALIGN(32)

C Entry: fetch parameters and dispatch on size.  Sizes 1 and 2 get
C dedicated straight-line code; 3 or more limbs go to L(three_or_more).
     64  PROLOGUE(mpn_sqr_basecase)
     65  deflit(`FRAME',0)
     66  
     67  	movl	PARAM_SIZE, %ecx
     68  	movl	PARAM_SRC, %eax
     69  	cmpl	$2, %ecx
     70  
     71  	movl	PARAM_DST, %edx
     72  	je	L(two_limbs)
     73  	ja	L(three_or_more)
     74  
     75  
     76  C------------------------------------------------------------------------------
     77  C one limb only
     78  	C eax	src
     79  	C ecx	size
     80  	C edx	dst
     81  
     82  	movl	(%eax), %eax
     83  	movl	%edx, %ecx
     84  
	C edx:eax = src[0]^2, stored as dst[1],dst[0]
     85  	mull	%eax
     86  
     87  	movl	%edx, 4(%ecx)
     88  	movl	%eax, (%ecx)
     89  	ret
     90  
     91  
     92  C------------------------------------------------------------------------------
     93  C
     94  C Using the read/modify/write "add"s seems to be faster than saving and
     95  C restoring registers.  Perhaps the loads for the first set hide under the
     96  C mul latency and the second gets store to load forwarding.
     97  
     98  	ALIGN(16)
     99  L(two_limbs):
    100  	C eax	src
    101  	C ebx
    102  	C ecx	size
    103  	C edx	dst
    104  deflit(`FRAME',0)
    105  
    106  	pushl	%ebx		FRAME_pushl()
    107  	movl	%eax, %ebx	C src
    108  	movl	(%eax), %eax
    109  
    110  	movl	%edx, %ecx	C dst
    111  
    112  	mull	%eax		C src[0]^2
    113  
    114  	movl	%eax, (%ecx)	C dst[0]
    115  	movl	4(%ebx), %eax
    116  
    117  	movl	%edx, 4(%ecx)	C dst[1]
    118  
    119  	mull	%eax		C src[1]^2
    120  
    121  	movl	%eax, 8(%ecx)	C dst[2]
    122  	movl	(%ebx), %eax
    123  
    124  	movl	%edx, 12(%ecx)	C dst[3]
    125  
    126  	mull	4(%ebx)		C src[0]*src[1]
    127  
    128  	popl	%ebx
    129  
	C The cross product src[0]*src[1] appears twice in the square, so
	C edx:eax is added into dst[1..3] twice rather than doubled first.
    130  	addl	%eax, 4(%ecx)
    131  	adcl	%edx, 8(%ecx)
    132  	adcl	$0, 12(%ecx)
    133  	ASSERT(nc)
    134  
    135  	addl	%eax, 4(%ecx)
    136  	adcl	%edx, 8(%ecx)
    137  	adcl	$0, 12(%ecx)
    138  	ASSERT(nc)
    139  
    140  	ret
    141  
    142  
    143  C------------------------------------------------------------------------------
dnl  Callee-saved register save slots, relative to esp after STACK_SPACE
dnl  has been subtracted.
    144  defframe(SAVE_EBX,  -4)
    145  defframe(SAVE_ESI,  -8)
    146  defframe(SAVE_EDI, -12)
    147  defframe(SAVE_EBP, -16)
    148  deflit(STACK_SPACE, 16)
    149  
    150  L(three_or_more):
    151  	subl	$STACK_SPACE, %esp
    152  	cmpl	$4, %ecx
    153  	jae	L(four_or_more)
    154  deflit(`FRAME',STACK_SPACE)
    155  
    156  
    157  C------------------------------------------------------------------------------
    158  C Three limbs
    159  C
    160  C Writing out the loads and stores separately at the end of this code comes
    161  C out about 10 cycles faster than using adcls to memory.
    162  
    163  	C eax	src
    164  	C ecx	size
    165  	C edx	dst
    166  
    167  	movl	%ebx, SAVE_EBX
    168  	movl	%eax, %ebx	C src
    169  	movl	(%eax), %eax
    170  
    171  	movl	%edx, %ecx	C dst
    172  	movl	%esi, SAVE_ESI
    173  	movl	%edi, SAVE_EDI
    174  
	C Squares first, stored straight to dst[0..5].
    175  	mull	%eax		C src[0] ^ 2
    176  
    177  	movl	%eax, (%ecx)
    178  	movl	4(%ebx), %eax
    179  	movl	%edx, 4(%ecx)
    180  
    181  	mull	%eax		C src[1] ^ 2
    182  
    183  	movl	%eax, 8(%ecx)
    184  	movl	8(%ebx), %eax
    185  	movl	%edx, 12(%ecx)
    186  
    187  	mull	%eax		C src[2] ^ 2
    188  
    189  	movl	%eax, 16(%ecx)
    190  	movl	(%ebx), %eax
    191  	movl	%edx, 20(%ecx)
    192  
	C Cross products accumulated in registers: esi,edi,ebp,edx(,ebx).
    193  	mull	4(%ebx)		C src[0] * src[1]
    194  
    195  	movl	%eax, %esi
    196  	movl	(%ebx), %eax
    197  	movl	%edx, %edi
    198  
    199  	mull	8(%ebx)		C src[0] * src[2]
    200  
    201  	addl	%eax, %edi
    202  	movl	%ebp, SAVE_EBP
    203  	movl	$0, %ebp
    204  
    205  	movl	4(%ebx), %eax
    206  	adcl	%edx, %ebp
    207  
    208  	mull	8(%ebx)		C src[1] * src[2]
    209  
    210  	xorl	%ebx, %ebx
    211  	addl	%eax, %ebp
    212  
    213  	adcl	$0, %edx
    214  
    215  	C eax
    216  	C ebx	zero, will be dst[5]
    217  	C ecx	dst
    218  	C edx	dst[4]
    219  	C esi	dst[1]
    220  	C edi	dst[2]
    221  	C ebp	dst[3]
    222  
	C Double the cross products (one-bit left shift through the chain
	C esi,edi,ebp,edx into ebx), then add them to the squares already
	C stored at dst[1..5].
    223  	adcl	$0, %edx
    224  	addl	%esi, %esi
    225  
    226  	adcl	%edi, %edi
    227  	movl	4(%ecx), %eax
    228  
    229  	adcl	%ebp, %ebp
    230  
    231  	adcl	%edx, %edx
    232  
    233  	adcl	$0, %ebx
    234  	addl	%eax, %esi
    235  	movl	8(%ecx), %eax
    236  
    237  	adcl	%eax, %edi
    238  	movl	12(%ecx), %eax
    239  	movl	%esi, 4(%ecx)
    240  
    241  	adcl	%eax, %ebp
    242  	movl	16(%ecx), %eax
    243  	movl	%edi, 8(%ecx)
    244  
    245  	movl	SAVE_ESI, %esi
    246  	movl	SAVE_EDI, %edi
    247  
    248  	adcl	%eax, %edx
    249  	movl	20(%ecx), %eax
    250  	movl	%ebp, 12(%ecx)
    251  
    252  	adcl	%ebx, %eax
    253  	ASSERT(nc)
    254  	movl	SAVE_EBX, %ebx
    255  	movl	SAVE_EBP, %ebp
    256  
    257  	movl	%edx, 16(%ecx)
    258  	movl	%eax, 20(%ecx)
    259  	addl	$FRAME, %esp
    260  
    261  	ret
    262  
    263  
    264  C------------------------------------------------------------------------------
    265  L(four_or_more):
    266  
    267  C First multiply src[0]*src[1..size-1] and store at dst[1..size].
    268  C Further products are added in rather than stored.
    269  
    270  	C eax	src
    271  	C ebx
    272  	C ecx	size
    273  	C edx	dst
    274  	C esi
    275  	C edi
    276  	C ebp
    277  
    278  defframe(`VAR_COUNTER',-20)
    279  defframe(`VAR_JMP',    -24)
    280  deflit(EXTRA_STACK_SPACE, 8)
    281  
    282  	movl	%ebx, SAVE_EBX
    283  	movl	%edi, SAVE_EDI
    284  	leal	(%edx,%ecx,4), %edi	C &dst[size]
    285  
    286  	movl	%esi, SAVE_ESI
    287  	movl	%ebp, SAVE_EBP
    288  	leal	(%eax,%ecx,4), %esi	C &src[size]
    289  
    290  	movl	(%eax), %ebp		C multiplier
    291  	movl	$0, %ebx
    292  	decl	%ecx
    293  
	C ecx = -(size-1), counting up to zero as src[1..size-1] is consumed
    294  	negl	%ecx
    295  	subl	$EXTRA_STACK_SPACE, %esp
    296  FRAME_subl_esp(EXTRA_STACK_SPACE)
    297  
    298  L(mul_1):
    299  	C eax	scratch
    300  	C ebx	carry
    301  	C ecx	counter
    302  	C edx	scratch
    303  	C esi	&src[size]
    304  	C edi	&dst[size]
    305  	C ebp	multiplier
    306  
    307  	movl	(%esi,%ecx,4), %eax
    308  
    309  	mull	%ebp
    310  
    311  	addl	%ebx, %eax
    312  	movl	%eax, (%edi,%ecx,4)
    313  	movl	$0, %ebx
    314  
    315  	adcl	%edx, %ebx
    316  	incl	%ecx
    317  	jnz	L(mul_1)
    318  
    319  
    320  C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
    321  C
    322  C The last two products, which are the bottom right corner of the product
    323  C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
    324  C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
    325  C cases that need to be done.
    326  C
    327  C The unrolled code is the same as in mpn_addmul_1, see that routine for
    328  C some comments.
    329  C
    330  C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
    331  C
    332  C VAR_JMP is the computed jump into the unrolled code, stepped by one code
    333  C chunk each outer loop.
    334  C
    335  C K7 does branch prediction on indirect jumps, which is bad since it's a
    336  C different target each time.  There seems no way to avoid this.
    337  
    338  dnl  This value also hard coded in some shifts and adds
    339  deflit(CODE_BYTES_PER_LIMB, 17)
    340  
    341  dnl  With the unmodified &src[size] and &dst[size] pointers, the
    342  dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
    343  dnl  values up to 31, but above that an offset must be added to them.
    344  
    345  deflit(OFFSET,
    346  ifelse(eval(UNROLL_COUNT>31),1,
    347  eval((UNROLL_COUNT-31)*4),
    348  0))
    349  
    350  dnl  Because the last chunk of code is generated differently, a label placed
    351  dnl  at the end doesn't work.  Instead calculate the implied end using the
    352  dnl  start and how many chunks of code there are.
    353  
    354  deflit(UNROLL_INNER_END,
    355  `L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
    356  
    357  	C eax
    358  	C ebx	carry
    359  	C ecx
    360  	C edx
    361  	C esi	&src[size]
    362  	C edi	&dst[size]
    363  	C ebp
    364  
	C Store the final carry of the mul_1 pass as dst[size].
    365  	movl	PARAM_SIZE, %ecx
    366  	movl	%ebx, (%edi)
    367  
    368  	subl	$4, %ecx
    369  	jz	L(corner)
    370  
    371  	negl	%ecx
    372  ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
    373  ifelse(OFFSET,0,,`subl	$OFFSET, %esi')
    374  
    375  	movl	%ecx, %edx
    376  	shll	$4, %ecx
    377  
C With edx = -(size-4) and ecx = 16*edx after the shift, the ecx+edx in
C the lea (or pic_calc) gives 17*edx = CODE_BYTES_PER_LIMB bytes per limb
C stepped back from the implied end of the unrolled code.
    378  ifdef(`PIC',`
    379  	call	L(pic_calc)
    380  L(here):
    381  ',`
    382  	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
    383  ')
    384  
    385  
    386  	C The calculated jump mustn't come out to before the start of the
    387  	C code available.  This is the limit UNROLL_COUNT puts on the src
    388  	C operand size, but checked here directly using the jump address.
    389  	ASSERT(ae,
    390  	`movl_text_address(L(unroll_inner_start), %eax)
    391  	cmpl	%eax, %ecx')
    392  
    394  C------------------------------------------------------------------------------
    395  	ALIGN(16)
    396  L(unroll_outer_top):
    397  	C eax
    398  	C ebx	high limb to store
    399  	C ecx	VAR_JMP
    400  	C edx	VAR_COUNTER, limbs, negative
    401  	C esi	&src[size], constant
    402  	C edi	dst ptr, high of last addmul
    403  	C ebp
    404  
    405  	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
    406  	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand
    407  
    408  	movl	%edx, VAR_COUNTER
    409  
    410  	mull	%ebp
    411  
dnl  cmovX: pick cmovz or cmovnz according to the parity of UNROLL_COUNT,
dnl  so the carry registers are swapped only when the entry chunk needs it.
    412  define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')

	C The low bit of the jump address says which of ecx/ebx must hold
	C the low carry on entry to the unrolled code (chunks alternate the
	C roles); conditionally swap them with cmov to match.
    413  	testb	$1, %cl
    414  	movl	%edx, %ebx	C high carry
    415  	movl	%ecx, %edx	C jump
    416  
    417  	movl	%eax, %ecx	C low carry
    418  	cmovX(	%ebx, %ecx)	C high carry reverse
    419  	cmovX(	%eax, %ebx)	C low carry reverse
    420  
	C Step the entry point forward one chunk for this outer iteration.
    421  	leal	CODE_BYTES_PER_LIMB(%edx), %eax
    422  	xorl	%edx, %edx
    423  	leal	4(%edi), %edi
    424  
    425  	movl	%eax, VAR_JMP
    426  
    427  	jmp	*%eax
    428  
    429  
C PIC variant of the non-PIC lea above: compute the same entry address
C relative to the return address pushed by the call (which is L(here)).
    430  ifdef(`PIC',`
    431  L(pic_calc):
    432  	addl	(%esp), %ecx
    433  	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
    434  	addl	%edx, %ecx
    435  	ret_internal
    436  ')
    437  
   438  
   439  
    440  	C Must be an even address to preserve the significance of the low
    441  	C bit of the jump address indicating which way around ecx/ebx should
    442  	C start.
    443  	ALIGN(2)

dnl  Each forloop iteration emits one chunk of exactly CODE_BYTES_PER_LIMB
dnl  bytes (Zdisp forces a zero displacement byte where needed); consecutive
dnl  chunks swap the ebx/ecx carry roles, hence the even/odd i cases.
    445  L(unroll_inner_start):
    446  	C eax	next limb
    447  	C ebx	carry high
    448  	C ecx	carry low
    449  	C edx	scratch
    450  	C esi	src
    451  	C edi	dst
    452  	C ebp	multiplier
    453  
    454  forloop(`i', UNROLL_COUNT, 1, `
    455  	deflit(`disp_src', eval(-i*4 + OFFSET))
    456  	deflit(`disp_dst', eval(disp_src - 4))
    457  
    458  	m4_assert(`disp_src>=-128 && disp_src<128')
    459  	m4_assert(`disp_dst>=-128 && disp_dst<128')
    460  
    461  ifelse(eval(i%2),0,`
    462  Zdisp(	movl,	disp_src,(%esi), %eax)
    463  	adcl	%edx, %ebx
    464  
    465  	mull	%ebp
    466  
    467  Zdisp(  addl,	%ecx, disp_dst,(%edi))
    468  	movl	$0, %ecx
    469  
    470  	adcl	%eax, %ebx
    471  
    472  ',`
    473  	dnl  this bit comes out last
    474  Zdisp(  movl,	disp_src,(%esi), %eax)
    475  	adcl	%edx, %ecx
    476  
    477  	mull	%ebp
    478  
    479  Zdisp(	addl,	%ebx, disp_dst,(%edi))
    480  
    481  ifelse(forloop_last,0,
    482  `	movl	$0, %ebx')
    483  
    484  	adcl	%eax, %ecx
    485  ')
    486  ')
    487  
    488  	C eax	next limb
    489  	C ebx	carry high
    490  	C ecx	carry low
    491  	C edx	scratch
    492  	C esi	src
    493  	C edi	dst
    494  	C ebp	multiplier
    495  
	C Wind up the addmul: fold the last carries into the top two dst
	C limbs, then advance the outer loop counter.
    496  	adcl	$0, %edx
    497  	addl	%ecx, -4+OFFSET(%edi)
    498  	movl	VAR_JMP, %ecx
    499  
    500  	adcl	$0, %edx
    501  
    502  	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
    503  	movl	VAR_COUNTER, %edx
    504  
    505  	incl	%edx
    506  	jnz	L(unroll_outer_top)
    507  
    508  
C Undo the OFFSET adjustment applied before the unrolled loop, if any.
    509  ifelse(OFFSET,0,,`
    510  	addl	$OFFSET, %esi
    511  	addl	$OFFSET, %edi
    512  ')
    513  
   514  
    515  C------------------------------------------------------------------------------
    516  L(corner):
    517  	C esi	&src[size]
    518  	C edi	&dst[2*size-5]

	C Bottom-right corner of the product triangle, done straight-line:
	C src[size-3]*src[size-2], src[size-3]*src[size-1] and
	C src[size-2]*src[size-1], added into the top of dst.
    520  	movl	-12(%esi), %ebp
    521  	movl	-8(%esi), %eax
    522  	movl	%eax, %ecx
    523  
    524  	mull	%ebp
    525  
    526  	addl	%eax, -4(%edi)
    527  	movl	-4(%esi), %eax
    528  
    529  	adcl	$0, %edx
    530  	movl	%edx, %ebx
    531  	movl	%eax, %esi
    532  
    533  	mull	%ebp
    534  
    535  	addl	%ebx, %eax
    536  
    537  	adcl	$0, %edx
    538  	addl	%eax, (%edi)
    539  	movl	%esi, %eax
    540  
    541  	adcl	$0, %edx
    542  	movl	%edx, %ebx
    543  
    544  	mull	%ecx
    545  
    546  	addl	%ebx, %eax
    547  	movl	%eax, 4(%edi)
    548  
    549  	adcl	$0, %edx
    550  	movl	%edx, 8(%edi)
    551  
    552  
    553  
    554  C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
    555  
    556  L(lshift_start):
    557  	movl	PARAM_SIZE, %eax
    558  	movl	PARAM_DST, %edi
    559  	xorl	%ecx, %ecx		C clear carry
    560  
    561  	leal	(%edi,%eax,8), %edi
    562  	notl	%eax			C -size-1, preserve carry
    563  
    564  	leal	2(%eax), %eax		C -(size-1)
    565  
    566  L(lshift):
    567  	C eax	counter, negative
    568  	C ebx
    569  	C ecx
    570  	C edx
    571  	C esi
    572  	C edi	dst, pointing just after last limb
    573  	C ebp
    574  
	C Two rotate-through-carry ops per iteration double two limbs,
	C propagating the shifted-out bit upward through dst.
    575  	rcll	-4(%edi,%eax,8)
    576  	rcll	(%edi,%eax,8)
    577  	incl	%eax
    578  	jnz	L(lshift)
    579  
	C Final carry out of the shift becomes the most significant limb
	C (eax is zero here from the loop exit, so setc leaves 0 or 1).
    580  	setc	%al
    581  
    582  	movl	PARAM_SRC, %esi
    583  	movl	%eax, -4(%edi)		C dst most significant limb
    584  
    585  	movl	PARAM_SIZE, %ecx
    586  
   587  
    588  C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
    589  C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
    590  C low limb of src[0]^2.
    591  
    592  	movl	(%esi), %eax		C src[0]
    593  
    594  	mull	%eax
    595  
    596  	leal	(%esi,%ecx,4), %esi	C src points just after last limb
    597  	negl	%ecx
    598  
    599  	movl	%eax, (%edi,%ecx,8)	C dst[0]
    600  	incl	%ecx
    601  
    602  L(diag):
    603  	C eax	scratch
    604  	C ebx	scratch
    605  	C ecx	counter, negative
    606  	C edx	carry
    607  	C esi	src just after last limb
    608  	C edi	dst just after last limb
    609  	C ebp
    610  
    611  	movl	(%esi,%ecx,4), %eax
    612  	movl	%edx, %ebx
    613  
    614  	mull	%eax
    615  
	C Add src[i]^2 (edx:eax) plus the previous square's high-limb carry
	C (ebx) into the two dst limbs at this diagonal position.
    616  	addl	%ebx, -4(%edi,%ecx,8)
    617  	adcl	%eax, (%edi,%ecx,8)
    618  	adcl	$0, %edx
    619  
    620  	incl	%ecx
    621  	jnz	L(diag)
    622  
    623  
    624  	movl	SAVE_ESI, %esi
    625  	movl	SAVE_EBX, %ebx
    626  
	C Last carry goes into the most significant limb; restore the
	C remaining callee-saved registers and the stack, then return.
    627  	addl	%edx, -4(%edi)		C dst most significant limb
    628  	movl	SAVE_EDI, %edi
    629  
    630  	movl	SAVE_EBP, %ebp
    631  	addl	$FRAME, %esp
    632  
    633  	ret
    634  
    635  EPILOGUE()