dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
C     product (measured on the speed difference between 20 and 40 limbs,
C     which is the Karatsuba recursing range).


dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
dnl  a description.  The only difference here is that UNROLL_COUNT can go up
dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
42 43 deflit(SQR_TOOM2_THRESHOLD_MAX, 67) 44 45 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE', 46 `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)') 47 48 m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD') 49 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3)) 50 51 52 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); 53 C 54 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a 55 C lot of function call overheads are avoided, especially when the given size 56 C is small. 57 C 58 C The code size might look a bit excessive, but not all of it is executed so 59 C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases 60 C clearly apply only to those sizes; mid sizes like 10x10 only need part of 61 C the unrolled addmul; and big sizes like 40x40 that do use the full 62 C unrolling will least be making good use of it, because 40x40 will take 63 C something like 7000 cycles. 64 65 defframe(PARAM_SIZE,12) 66 defframe(PARAM_SRC, 8) 67 defframe(PARAM_DST, 4) 68 69 TEXT 70 ALIGN(32) 71 PROLOGUE(mpn_sqr_basecase) 72 deflit(`FRAME',0) 73 74 movl PARAM_SIZE, %edx 75 76 movl PARAM_SRC, %eax 77 78 cmpl $2, %edx 79 movl PARAM_DST, %ecx 80 je L(two_limbs) 81 82 movl (%eax), %eax 83 ja L(three_or_more) 84 85 86 C ----------------------------------------------------------------------------- 87 C one limb only 88 C eax src limb 89 C ebx 90 C ecx dst 91 C edx 92 93 mull %eax 94 95 movl %eax, (%ecx) 96 movl %edx, 4(%ecx) 97 98 ret 99 100 101 C ----------------------------------------------------------------------------- 102 L(two_limbs): 103 C eax src 104 C ebx 105 C ecx dst 106 C edx 107 108 defframe(SAVE_ESI, -4) 109 defframe(SAVE_EBX, -8) 110 defframe(SAVE_EDI, -12) 111 defframe(SAVE_EBP, -16) 112 deflit(`STACK_SPACE',16) 113 114 subl $STACK_SPACE, %esp 115 deflit(`FRAME',STACK_SPACE) 116 117 movl %esi, SAVE_ESI 118 movl %eax, %esi 119 movl (%eax), %eax 120 121 mull %eax C src[0]^2 122 123 movl %eax, (%ecx) C dst[0] 124 movl 4(%esi), 
%eax 125 126 movl %ebx, SAVE_EBX 127 movl %edx, %ebx C dst[1] 128 129 mull %eax C src[1]^2 130 131 movl %edi, SAVE_EDI 132 movl %eax, %edi C dst[2] 133 movl (%esi), %eax 134 135 movl %ebp, SAVE_EBP 136 movl %edx, %ebp C dst[3] 137 138 mull 4(%esi) C src[0]*src[1] 139 140 addl %eax, %ebx 141 movl SAVE_ESI, %esi 142 143 adcl %edx, %edi 144 145 adcl $0, %ebp 146 addl %ebx, %eax 147 movl SAVE_EBX, %ebx 148 149 adcl %edi, %edx 150 movl SAVE_EDI, %edi 151 152 adcl $0, %ebp 153 154 movl %eax, 4(%ecx) 155 156 movl %ebp, 12(%ecx) 157 movl SAVE_EBP, %ebp 158 159 movl %edx, 8(%ecx) 160 addl $FRAME, %esp 161 162 ret 163 164 165 C ----------------------------------------------------------------------------- 166 L(three_or_more): 167 C eax src low limb 168 C ebx 169 C ecx dst 170 C edx size 171 deflit(`FRAME',0) 172 173 pushl %esi defframe_pushl(`SAVE_ESI') 174 cmpl $4, %edx 175 176 movl PARAM_SRC, %esi 177 jae L(four_or_more) 178 179 180 C ----------------------------------------------------------------------------- 181 C three limbs 182 183 C eax src low limb 184 C ebx 185 C ecx dst 186 C edx 187 C esi src 188 C edi 189 C ebp 190 191 pushl %ebp defframe_pushl(`SAVE_EBP') 192 pushl %edi defframe_pushl(`SAVE_EDI') 193 194 mull %eax C src[0] ^ 2 195 196 movl %eax, (%ecx) 197 movl %edx, 4(%ecx) 198 199 movl 4(%esi), %eax 200 xorl %ebp, %ebp 201 202 mull %eax C src[1] ^ 2 203 204 movl %eax, 8(%ecx) 205 movl %edx, 12(%ecx) 206 movl 8(%esi), %eax 207 208 pushl %ebx defframe_pushl(`SAVE_EBX') 209 210 mull %eax C src[2] ^ 2 211 212 movl %eax, 16(%ecx) 213 movl %edx, 20(%ecx) 214 215 movl (%esi), %eax 216 217 mull 4(%esi) C src[0] * src[1] 218 219 movl %eax, %ebx 220 movl %edx, %edi 221 222 movl (%esi), %eax 223 224 mull 8(%esi) C src[0] * src[2] 225 226 addl %eax, %edi 227 movl %edx, %ebp 228 229 adcl $0, %ebp 230 movl 4(%esi), %eax 231 232 mull 8(%esi) C src[1] * src[2] 233 234 xorl %esi, %esi 235 addl %eax, %ebp 236 237 C eax 238 C ebx dst[1] 239 C ecx dst 240 C edx dst[4] 241 C esi 
zero, will be dst[5] 242 C edi dst[2] 243 C ebp dst[3] 244 245 adcl $0, %edx 246 addl %ebx, %ebx 247 248 adcl %edi, %edi 249 250 adcl %ebp, %ebp 251 252 adcl %edx, %edx 253 movl 4(%ecx), %eax 254 255 adcl $0, %esi 256 addl %ebx, %eax 257 258 movl %eax, 4(%ecx) 259 movl 8(%ecx), %eax 260 261 adcl %edi, %eax 262 movl 12(%ecx), %ebx 263 264 adcl %ebp, %ebx 265 movl 16(%ecx), %edi 266 267 movl %eax, 8(%ecx) 268 movl SAVE_EBP, %ebp 269 270 movl %ebx, 12(%ecx) 271 movl SAVE_EBX, %ebx 272 273 adcl %edx, %edi 274 movl 20(%ecx), %eax 275 276 movl %edi, 16(%ecx) 277 movl SAVE_EDI, %edi 278 279 adcl %esi, %eax C no carry out of this 280 movl SAVE_ESI, %esi 281 282 movl %eax, 20(%ecx) 283 addl $FRAME, %esp 284 285 ret 286 287 288 289 C ----------------------------------------------------------------------------- 290 defframe(VAR_COUNTER,-20) 291 defframe(VAR_JMP, -24) 292 deflit(`STACK_SPACE',24) 293 294 L(four_or_more): 295 C eax src low limb 296 C ebx 297 C ecx 298 C edx size 299 C esi src 300 C edi 301 C ebp 302 deflit(`FRAME',4) dnl %esi already pushed 303 304 C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 305 306 subl $STACK_SPACE-FRAME, %esp 307 deflit(`FRAME',STACK_SPACE) 308 movl $1, %ecx 309 310 movl %edi, SAVE_EDI 311 movl PARAM_DST, %edi 312 313 movl %ebx, SAVE_EBX 314 subl %edx, %ecx C -(size-1) 315 316 movl %ebp, SAVE_EBP 317 movl $0, %ebx C initial carry 318 319 leal (%esi,%edx,4), %esi C &src[size] 320 movl %eax, %ebp C multiplier 321 322 leal -4(%edi,%edx,4), %edi C &dst[size-1] 323 324 325 C This loop runs at just over 6 c/l. 
326 327 L(mul_1): 328 C eax scratch 329 C ebx carry 330 C ecx counter, limbs, negative, -(size-1) to -1 331 C edx scratch 332 C esi &src[size] 333 C edi &dst[size-1] 334 C ebp multiplier 335 336 movl %ebp, %eax 337 338 mull (%esi,%ecx,4) 339 340 addl %ebx, %eax 341 movl $0, %ebx 342 343 adcl %edx, %ebx 344 movl %eax, 4(%edi,%ecx,4) 345 346 incl %ecx 347 jnz L(mul_1) 348 349 350 movl %ebx, 4(%edi) 351 352 353 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. 354 C 355 C The last two addmuls, which are the bottom right corner of the product 356 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] 357 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner 358 C cases that need to be done. 359 C 360 C The unrolled code is the same as mpn_addmul_1(), see that routine for some 361 C comments. 362 C 363 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. 364 C 365 C VAR_JMP is the computed jump into the unrolled code, stepped by one code 366 C chunk each outer loop. 367 368 dnl This is also hard-coded in the address calculation below. 369 deflit(CODE_BYTES_PER_LIMB, 15) 370 371 dnl With &src[size] and &dst[size-1] pointers, the displacements in the 372 dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above 373 dnl that an offset must be added to them. 
374 deflit(OFFSET, 375 ifelse(eval(UNROLL_COUNT>32),1, 376 eval((UNROLL_COUNT-32)*4), 377 0)) 378 379 C eax 380 C ebx carry 381 C ecx 382 C edx 383 C esi &src[size] 384 C edi &dst[size-1] 385 C ebp 386 387 movl PARAM_SIZE, %ecx 388 389 subl $4, %ecx 390 jz L(corner) 391 392 movl %ecx, %edx 393 negl %ecx 394 395 shll $4, %ecx 396 ifelse(OFFSET,0,,`subl $OFFSET, %esi') 397 398 ifdef(`PIC',` 399 call L(pic_calc) 400 L(here): 401 ',` 402 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx 403 ') 404 negl %edx 405 406 ifelse(OFFSET,0,,`subl $OFFSET, %edi') 407 408 C The calculated jump mustn't be before the start of the available 409 C code. This is the limit that UNROLL_COUNT puts on the src operand 410 C size, but checked here using the jump address directly. 411 412 ASSERT(ae, 413 `movl_text_address( L(unroll_inner_start), %eax) 414 cmpl %eax, %ecx') 415 416 417 C ----------------------------------------------------------------------------- 418 ALIGN(16) 419 L(unroll_outer_top): 420 C eax 421 C ebx high limb to store 422 C ecx VAR_JMP 423 C edx VAR_COUNTER, limbs, negative 424 C esi &src[size], constant 425 C edi dst ptr, second highest limb of last addmul 426 C ebp 427 428 movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier 429 movl %edx, VAR_COUNTER 430 431 movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand 432 433 mull %ebp 434 435 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') 436 437 testb $1, %cl 438 439 movl %edx, %ebx C high carry 440 leal 4(%edi), %edi 441 442 movl %ecx, %edx C jump 443 444 movl %eax, %ecx C low carry 445 leal CODE_BYTES_PER_LIMB(%edx), %edx 446 447 cmovX( %ebx, %ecx) C high carry reverse 448 cmovX( %eax, %ebx) C low carry reverse 449 movl %edx, VAR_JMP 450 jmp *%edx 451 452 453 C Must be on an even address here so the low bit of the jump address 454 C will indicate which way around ecx/ebx should start. 
455 456 ALIGN(2) 457 458 L(unroll_inner_start): 459 C eax scratch 460 C ebx carry high 461 C ecx carry low 462 C edx scratch 463 C esi src pointer 464 C edi dst pointer 465 C ebp multiplier 466 C 467 C 15 code bytes each limb 468 C ecx/ebx reversed on each chunk 469 470 forloop(`i', UNROLL_COUNT, 1, ` 471 deflit(`disp_src', eval(-i*4 + OFFSET)) 472 deflit(`disp_dst', eval(disp_src)) 473 474 m4_assert(`disp_src>=-128 && disp_src<128') 475 m4_assert(`disp_dst>=-128 && disp_dst<128') 476 477 ifelse(eval(i%2),0,` 478 Zdisp( movl, disp_src,(%esi), %eax) 479 mull %ebp 480 Zdisp( addl, %ebx, disp_dst,(%edi)) 481 adcl %eax, %ecx 482 movl %edx, %ebx 483 adcl $0, %ebx 484 ',` 485 dnl this one comes out last 486 Zdisp( movl, disp_src,(%esi), %eax) 487 mull %ebp 488 Zdisp( addl, %ecx, disp_dst,(%edi)) 489 adcl %eax, %ebx 490 movl %edx, %ecx 491 adcl $0, %ecx 492 ') 493 ') 494 L(unroll_inner_end): 495 496 addl %ebx, m4_empty_if_zero(OFFSET)(%edi) 497 498 movl VAR_COUNTER, %edx 499 adcl $0, %ecx 500 501 movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) 502 movl VAR_JMP, %ecx 503 504 incl %edx 505 jnz L(unroll_outer_top) 506 507 508 ifelse(OFFSET,0,,` 509 addl $OFFSET, %esi 510 addl $OFFSET, %edi 511 ') 512 513 514 C ----------------------------------------------------------------------------- 515 ALIGN(16) 516 L(corner): 517 C eax 518 C ebx 519 C ecx 520 C edx 521 C esi &src[size] 522 C edi &dst[2*size-5] 523 C ebp 524 525 movl -12(%esi), %eax 526 527 mull -8(%esi) 528 529 addl %eax, (%edi) 530 movl -12(%esi), %eax 531 movl $0, %ebx 532 533 adcl %edx, %ebx 534 535 mull -4(%esi) 536 537 addl %eax, %ebx 538 movl -8(%esi), %eax 539 540 adcl $0, %edx 541 542 addl %ebx, 4(%edi) 543 movl $0, %ebx 544 545 adcl %edx, %ebx 546 547 mull -4(%esi) 548 549 movl PARAM_SIZE, %ecx 550 addl %ebx, %eax 551 552 adcl $0, %edx 553 554 movl %eax, 8(%edi) 555 556 movl %edx, 12(%edi) 557 movl PARAM_DST, %edi 558 559 560 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. 
561 562 subl $1, %ecx C size-1 563 xorl %eax, %eax C ready for final adcl, and clear carry 564 565 movl %ecx, %edx 566 movl PARAM_SRC, %esi 567 568 569 L(lshift): 570 C eax 571 C ebx 572 C ecx counter, size-1 to 1 573 C edx size-1 (for later use) 574 C esi src (for later use) 575 C edi dst, incrementing 576 C ebp 577 578 rcll 4(%edi) 579 rcll 8(%edi) 580 581 leal 8(%edi), %edi 582 decl %ecx 583 jnz L(lshift) 584 585 586 adcl %eax, %eax 587 588 movl %eax, 4(%edi) C dst most significant limb 589 movl (%esi), %eax C src[0] 590 591 leal 4(%esi,%edx,4), %esi C &src[size] 592 subl %edx, %ecx C -(size-1) 593 594 595 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., 596 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the 597 C low limb of src[0]^2. 598 599 600 mull %eax 601 602 movl %eax, (%edi,%ecx,8) C dst[0] 603 604 605 L(diag): 606 C eax scratch 607 C ebx scratch 608 C ecx counter, negative 609 C edx carry 610 C esi &src[size] 611 C edi dst[2*size-2] 612 C ebp 613 614 movl (%esi,%ecx,4), %eax 615 movl %edx, %ebx 616 617 mull %eax 618 619 addl %ebx, 4(%edi,%ecx,8) 620 adcl %eax, 8(%edi,%ecx,8) 621 adcl $0, %edx 622 623 incl %ecx 624 jnz L(diag) 625 626 627 movl SAVE_ESI, %esi 628 movl SAVE_EBX, %ebx 629 630 addl %edx, 4(%edi) C dst most significant limb 631 632 movl SAVE_EDI, %edi 633 movl SAVE_EBP, %ebp 634 addl $FRAME, %esp 635 ret 636 637 638 639 C ----------------------------------------------------------------------------- 640 ifdef(`PIC',` 641 L(pic_calc): 642 addl (%esp), %ecx 643 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx 644 addl %edx, %ecx 645 ret_internal 646 ') 647 648 649 EPILOGUE()