github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/sqr_basecase.asm (about)

     1  dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
     2  
     3  dnl  Copyright 1999-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
    35  C product at around 20x20 limbs.
    36  
    37  
    38  C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
    39  C
    40  C Calculate src,size squared, storing the result in dst,2*size.
    41  C
    42  C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
    43  C lot of function call overheads are avoided, especially when the size is
    44  C small.
    45  
    46  defframe(PARAM_SIZE,12)
    47  defframe(PARAM_SRC, 8)
    48  defframe(PARAM_DST, 4)
    49  
    50  	TEXT
    51  	ALIGN(8)
    52  PROLOGUE(mpn_sqr_basecase)
    53  deflit(`FRAME',0)
    54  	C Dispatch on size: 2 goes to two_limbs, above 2 to three_or_more, else fall through.
    55  	movl	PARAM_SIZE, %edx
    56  	movl	PARAM_SRC, %eax
    57  
    58  	cmpl	$2, %edx
    59  	movl	PARAM_DST, %ecx		C movl leaves the cmpl flags intact
    60  
    61  	je	L(two_limbs)		C size == 2, eax still holds the src pointer
    62  
    63  	movl	(%eax), %eax		C eax = src[0]
    64  	ja	L(three_or_more)	C size > 2 (unsigned), still the cmpl flags
    65  
    66  C -----------------------------------------------------------------------------
    67  C one limb only (fall-through case, size below 2)
    68  	C eax	src low limb
    69  	C ebx
    70  	C ecx	dst
    71  	C edx	size
    72  
    73  	mull	%eax		C edx:eax = src[0]^2
    74  
    75  	movl	%eax, (%ecx)	C dst[0]
    76  	movl	%edx, 4(%ecx)	C dst[1]
    77  
    78  	ret
    79  
    80  C -----------------------------------------------------------------------------
    81  	ALIGN(8)
    82  L(two_limbs):
    83  	C eax	src
    84  	C ebx
    85  	C ecx	dst
    86  	C edx	size
    87  	C Both squares are formed first, then src[0]*src[1] is added in twice.
    88  	pushl	%ebp
    89  	pushl	%edi
    90  
    91  	pushl	%esi
    92  	pushl	%ebx
    93  
    94  	movl	%eax, %ebx	C ebx = src, since mull overwrites eax
    95  	movl	(%eax), %eax
    96  
    97  	mull	%eax		C src[0]^2
    98  
    99  	movl	%eax, (%ecx)	C dst[0]
   100  	movl	%edx, %esi	C dst[1]
   101  
   102  	movl	4(%ebx), %eax
   103  
   104  	mull	%eax		C src[1]^2
   105  
   106  	movl	%eax, %edi	C dst[2]
   107  	movl	%edx, %ebp	C dst[3]
   108  
   109  	movl	(%ebx), %eax
   110  
   111  	mull	4(%ebx)		C src[0]*src[1]
   112  
   113  	addl	%eax, %esi	C first addition of the cross product
   114  	popl	%ebx		C restore, popl does not alter the carry flag
   115  
   116  	adcl	%edx, %edi
   117  
   118  	adcl	$0, %ebp
   119  	addl	%esi, %eax	C second addition, edx:eax still hold the cross product
   120  
   121  	adcl	%edi, %edx
   122  	movl	%eax, 4(%ecx)	C dst[1]
   123  
   124  	adcl	$0, %ebp	C carry from the adcl above, movl keeps the flags
   125  	popl	%esi
   126  
   127  	movl	%edx, 8(%ecx)	C dst[2]
   128  	movl	%ebp, 12(%ecx)	C dst[3]
   129  
   130  	popl	%edi
   131  	popl	%ebp
   132  
   133  	ret
   134  
   135  
   136  C -----------------------------------------------------------------------------
   137  	ALIGN(8)
   138  L(three_or_more):
   139  	C eax	src low limb
   140  	C ebx
   141  	C ecx	dst
   142  	C edx	size
   143  
   144  	cmpl	$4, %edx
   145  	pushl	%ebx
   146  deflit(`FRAME',4)
   147  
   148  	movl	PARAM_SRC, %ebx
   149  	jae	L(four_or_more)		C size >= 4, flags still from the cmpl above
   150  
   151  
   152  C -----------------------------------------------------------------------------
   153  C three limbs
   154  	C eax	src low limb
   155  	C ebx	src
   156  	C ecx	dst
   157  	C edx	size
   158  	C Squares are stored at dst[0..5], cross products are summed in registers, doubled, then added in.
   159  	pushl	%ebp
   160  	pushl	%edi
   161  
   162  	mull	%eax		C src[0] ^ 2
   163  
   164  	movl	%eax, (%ecx)
   165  	movl	%edx, 4(%ecx)
   166  
   167  	movl	4(%ebx), %eax
   168  	xorl	%ebp, %ebp	C ebp is reloaded from edx below before it is read
   169  
   170  	mull	%eax		C src[1] ^ 2
   171  
   172  	movl	%eax, 8(%ecx)
   173  	movl	%edx, 12(%ecx)
   174  
   175  	movl	8(%ebx), %eax
   176  	pushl	%esi		C risk of cache bank clash
   177  
   178  	mull	%eax		C src[2] ^ 2
   179  
   180  	movl	%eax, 16(%ecx)
   181  	movl	%edx, 20(%ecx)
   182  
   183  	movl	(%ebx), %eax
   184  
   185  	mull	4(%ebx)		C src[0] * src[1]
   186  
   187  	movl	%eax, %esi
   188  	movl	%edx, %edi
   189  
   190  	movl	(%ebx), %eax
   191  
   192  	mull	8(%ebx)		C src[0] * src[2]
   193  
   194  	addl	%eax, %edi
   195  	movl	%edx, %ebp
   196  
   197  	adcl	$0, %ebp
   198  	movl	4(%ebx), %eax
   199  
   200  	mull	8(%ebx)		C src[1] * src[2]
   201  
   202  	xorl	%ebx, %ebx	C src pointer no longer needed
   203  	addl	%eax, %ebp
   204  
   205  	C eax
   206  	C ebx	zero, will be dst[5]
   207  	C ecx	dst
   208  	C edx	dst[4]
   209  	C esi	dst[1]
   210  	C edi	dst[2]
   211  	C ebp	dst[3]
   212  
   213  	adcl	$0, %edx	C carry from the addl into ebp above
   214  	addl	%esi, %esi	C double the cross product sum, low limb first
   215  
   216  	adcl	%edi, %edi
   217  
   218  	adcl	%ebp, %ebp
   219  
   220  	adcl	%edx, %edx
   221  	movl	4(%ecx), %eax
   222  
   223  	adcl	$0, %ebx	C ebx = bit doubled out of edx, movl keeps the carry
   224  	addl	%esi, %eax	C now add the doubled limbs to the stored squares
   225  
   226  	movl	%eax, 4(%ecx)
   227  	movl	8(%ecx), %eax
   228  
   229  	adcl	%edi, %eax
   230  	movl	12(%ecx), %esi
   231  
   232  	adcl	%ebp, %esi
   233  	movl	16(%ecx), %edi
   234  
   235  	movl	%eax, 8(%ecx)
   236  	movl	%esi, 12(%ecx)
   237  
   238  	adcl	%edx, %edi
   239  	popl	%esi
   240  
   241  	movl	20(%ecx), %eax
   242  	movl	%edi, 16(%ecx)
   243  
   244  	popl	%edi
   245  	popl	%ebp
   246  
   247  	adcl	%ebx, %eax	C no carry out of this
   248  	popl	%ebx
   249  
   250  	movl	%eax, 20(%ecx)
   251  
   252  	ret
   253  
   254  
   255  C -----------------------------------------------------------------------------
   256  	ALIGN(8)
   257  L(four_or_more):
   258  	C eax	src low limb
   259  	C ebx	src
   260  	C ecx	dst
   261  	C edx	size
   262  	C esi
   263  	C edi
   264  	C ebp
   265  	C
   266  	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
   267  
   268  deflit(`FRAME',4)
   269  
   270  	pushl	%edi
   271  FRAME_pushl()
   272  	pushl	%esi
   273  FRAME_pushl()
   274  
   275  	pushl	%ebp
   276  FRAME_pushl()
   277  	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
   278  
   279  	leal	(%ebx,%edx,4), %esi	C src end
   280  	movl	%ebx, %ebp		C src
   281  
   282  	negl	%edx			C -size
   283  	xorl	%ebx, %ebx		C clear carry limb and carry flag
   284  
   285  	leal	1(%edx), %ecx		C -(size-1)
   286  
   287  L(mul1):
   288  	C eax	scratch
   289  	C ebx	carry
   290  	C ecx	counter, negative
   291  	C edx	scratch
   292  	C esi	&src[size]
   293  	C edi	&dst[size]
   294  	C ebp	src
   295  
   296  	adcl	$0, %ebx		C add carry flag left by the previous addl
   297  	movl	(%esi,%ecx,4), %eax
   298  
   299  	mull	(%ebp)			C src[0] * src[size+ecx]
   300  
   301  	addl	%eax, %ebx
   302  
   303  	movl	%ebx, (%edi,%ecx,4)
   304  	incl	%ecx			C incl leaves the carry flag for the next adcl
   305  
   306  	movl	%edx, %ebx		C high limb becomes the next carry limb
   307  	jnz	L(mul1)
   308  
   309  
   310  	C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
   311  	C n=1..size-2.
   312  	C
   313  	C The last two products, which are the end corner of the product
   314  	C triangle, are handled separately to save looping overhead.  These
   315  	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
   316  	C If size is 4 then it's only these that need to be done.
   317  	C
   318  	C In the outer loop %esi is a constant, and %edi just advances by 1
   319  	C limb each time.  The size of the operation decreases by 1 limb
   320  	C each time.
   321  
   322  	C eax
   323  	C ebx	carry (needing carry flag added)
   324  	C ecx
   325  	C edx
   326  	C esi	&src[size]
   327  	C edi	&dst[size]
   328  	C ebp
   329  
   330  	adcl	$0, %ebx		C fold the last carry flag into the carry limb
   331  	movl	PARAM_SIZE, %edx
   332  
   333  	movl	%ebx, (%edi)		C dst[size]
   334  	subl	$4, %edx
   335  
   336  	negl	%edx			C -(size-4), outer loop count
   337  	jz	L(corner)		C size == 4, only the corner remains
   338  
   339  
   340  L(outer):
   341  	C ebx	previous carry limb to store
   342  	C edx	outer loop counter (negative)
   343  	C esi	&src[size]
   344  	C edi	dst, pointing at stored carry limb of previous loop
   345  
   346  	pushl	%edx			C new outer loop counter
   347  	leal	-2(%edx), %ecx		C inner loop counter, outer counter minus 2
   348  
   349  	movl	%ebx, (%edi)
   350  	addl	$4, %edi
   351  
   352  	addl	$4, %ebp		C next multiplier limb
   353  	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
   354  
   355  L(inner):
   356  	C eax	scratch
   357  	C ebx	carry (needing carry flag added)
   358  	C ecx	counter, negative
   359  	C edx	scratch
   360  	C esi	&src[size]
   361  	C edi	dst end of this addmul
   362  	C ebp	&src[j]
   363  
   364  	adcl	$0, %ebx		C add carry flag left by the previous addl
   365  	movl	(%esi,%ecx,4), %eax
   366  
   367  	mull	(%ebp)			C src[j] * src[size+ecx]
   368  
   369  	addl	%ebx, %eax		C add incoming carry limb to the low product
   370  	movl	(%edi,%ecx,4), %ebx	C existing dst limb
   371  
   372  	adcl	$0, %edx
   373  	addl	%eax, %ebx
   374  
   375  	movl	%ebx, (%edi,%ecx,4)
   376  	incl	%ecx			C incl leaves the carry flag for the next adcl
   377  
   378  	movl	%edx, %ebx
   379  	jnz	L(inner)
   380  
   381  
   382  	adcl	$0, %ebx
   383  	popl	%edx		C outer loop counter
   384  
   385  	incl	%edx
   386  	jnz	L(outer)
   387  
   388  
   389  	movl	%ebx, (%edi)		C carry limb of the final addmul
   390  
   391  L(corner):
   392  	C esi	&src[size]
   393  	C edi	&dst[2*size-4]
   394  
   395  	movl	-8(%esi), %eax
   396  	movl	-4(%edi), %ebx		C risk of data cache bank clash here
   397  
   398  	mull	-12(%esi)		C src[size-2]*src[size-3]
   399  
   400  	addl	%eax, %ebx
   401  	movl	%edx, %ecx
   402  
   403  	adcl	$0, %ecx
   404  	movl	-4(%esi), %eax
   405  
   406  	mull	-12(%esi)		C src[size-1]*src[size-3]
   407  
   408  	addl	%ecx, %eax
   409  	movl	(%edi), %ecx		C dst[2*size-4]
   410  
   411  	adcl	$0, %edx
   412  	movl	%ebx, -4(%edi)		C dst[2*size-5]
   413  
   414  	addl	%eax, %ecx
   415  	movl	%edx, %ebx
   416  
   417  	adcl	$0, %ebx
   418  	movl	-4(%esi), %eax
   419  
   420  	mull	-8(%esi)		C src[size-1]*src[size-2]
   421  
   422  	movl	%ecx, (%edi)		C dst[2*size-4]
   423  	addl	%eax, %ebx
   424  
   425  	adcl	$0, %edx
   426  	movl	PARAM_SIZE, %eax
   427  
   428  	negl	%eax
   429  	movl	%ebx, 4(%edi)		C dst[2*size-3]
   430  
   431  	addl	$1, %eax		C -(size-1) and clear carry
   432  	movl	%edx, 8(%edi)		C dst[2*size-2]
   433  
   434  
   435  C -----------------------------------------------------------------------------
   436  C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
   437  C Entered with eax = -(size-1) and the carry flag clear; two limbs per iteration.
   438  L(lshift):
   439  	C eax	counter, negative
   440  	C ebx	next limb
   441  	C ecx	scratch (second limb of the pair)
   442  	C edx
   443  	C esi
   444  	C edi	&dst[2*size-4]
   445  	C ebp
   446  
   447  	movl	12(%edi,%eax,8), %ebx	C dst[1] on the first pass
   448  
   449  	rcll	%ebx			C shift left one bit through the carry flag
   450  	movl	16(%edi,%eax,8), %ecx
   451  
   452  	rcll	%ecx
   453  	movl	%ebx, 12(%edi,%eax,8)
   454  
   455  	movl	%ecx, 16(%edi,%eax,8)
   456  	incl	%eax			C incl keeps the carry flag for the next rcll
   457  
   458  	jnz	L(lshift)
   459  
   460  
   461  	adcl	%eax, %eax		C high bit out
   462  	movl	PARAM_SRC, %esi
   463  
   464  	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
   465  	movl	%eax, 12(%edi)		C dst most significant limb
   466  
   467  
   468  C -----------------------------------------------------------------------------
   469  C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
   470  C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
   471  C low limb of src[0]^2.
   472  
   473  	movl	(%esi), %eax		C src[0]
   474  	leal	(%esi,%ecx,4), %esi	C src end
   475  
   476  	negl	%ecx			C -size
   477  
   478  	mull	%eax			C src[0]^2
   479  
   480  	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
   481  	movl	%edx, %ebx		C high limb, added in at dst[1] by the loop
   482  
   483  	addl	$1, %ecx		C -(size-1) and clear carry
   484  
   485  L(diag):
   486  	C eax	scratch (low product)
   487  	C ebx	carry limb
   488  	C ecx	counter, negative
   489  	C edx	scratch (high product)
   490  	C esi	&src[size]
   491  	C edi	&dst[2*size-4]
   492  	C ebp	scratch (fetched dst limbs)
   493  
   494  	movl	(%esi,%ecx,4), %eax
   495  	adcl	$0, %ebx		C carry flag from the previous iteration
   496  
   497  	mull	%eax			C square of src[size+ecx]
   498  
   499  	movl	16-4(%edi,%ecx,8), %ebp	C odd dst limb, gets the previous high limb
   500  
   501  	addl	%ebp, %ebx
   502  	movl	16(%edi,%ecx,8), %ebp	C even dst limb, gets this low limb
   503  
   504  	adcl	%eax, %ebp
   505  	movl	%ebx, 16-4(%edi,%ecx,8)
   506  
   507  	movl	%ebp, 16(%edi,%ecx,8)
   508  	incl	%ecx			C incl keeps the carry flag for the next adcl
   509  
   510  	movl	%edx, %ebx
   511  	jnz	L(diag)
   512  
   513  
   514  	adcl	$0, %edx		C final high limb plus the last carry flag
   515  	movl	16-4(%edi), %eax	C dst most significant limb
   516  
   517  	addl	%eax, %edx
   518  	popl	%ebp
   519  
   520  	movl	%edx, 16-4(%edi)
   521  	popl	%esi		C risk of cache bank clash
   522  
   523  	popl	%edi
   524  	popl	%ebx
   525  
   526  	ret
   527  
   528  EPILOGUE()