github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/sqr_basecase.asm (about)

     1  dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
     2  
     3  dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C     cycles/crossproduct  cycles/triangleproduct
    36  C P5
    37  C P6
    38  C K6
    39  C K7
    40  C P4
    41  
    42  
    43  C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
    44  C
    45  C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
    46  C lot of function call overheads are avoided, especially when the size is
    47  C small.
    48  C
    49  C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
    50  C code size to do so here.
    51  C
    52  C Enhancements:
    53  C
    54  C The addmul loop here is also not unrolled like aorsmul_1.asm and
    55  C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
    56  C complexity, but if it's worth doing in the other places then it should be
    57  C worthwhile here.
    58  C
    59  C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
    60  C might be worth considering.  That'd add quite a bit to the code size, but
    61  C only as much as is used would be dragged into L1 cache.
    62  
    63  defframe(PARAM_SIZE,12)
    64  defframe(PARAM_SRC, 8)
    65  defframe(PARAM_DST, 4)
    66  
    67  	TEXT
    68  	ALIGN(8)
    69  PROLOGUE(mpn_sqr_basecase)
    70  deflit(`FRAME',0)
    71  C Entry point: load size, src and dst into registers, then branch on size.
    72  	movl	PARAM_SIZE, %edx
    73  
    74  	movl	PARAM_SRC, %eax
    75  
    76  	cmpl	$2, %edx
    77  	movl	PARAM_DST, %ecx		C movl does not disturb the cmpl flags
    78  
    79  	je	L(two_limbs)		C size == 2
    80  	ja	L(three_or_more)	C size > 2, unsigned compare
    81  	C Fall through for size < 2; the code below squares exactly one limb.
    82  
    83  C -----------------------------------------------------------------------------
    84  C one limb only
    85  	C eax	src
    86  	C ebx
    87  	C ecx	dst
    88  	C edx	size
    89  
    90  	movl	(%eax), %eax	C src[0]
    91  	mull	%eax		C edx:eax = src[0]^2
    92  	movl	%eax, (%ecx)	C dst[0], low limb
    93  	movl	%edx, 4(%ecx)	C dst[1], high limb
    94  	ret
    95  
    96  
    97  C -----------------------------------------------------------------------------
    98  	ALIGN(8)
    99  L(two_limbs):
   100  	C eax	src
   101  	C ebx
   102  	C ecx	dst
   103  	C edx	size
   104  	C dst[0..3] = src[0]^2 + 2*src[0]*src[1]*2^32 + src[1]^2*2^64
   105  	pushl	%ebx		C ebx, ebp, esi, edi are restored before ret
   106  	pushl	%ebp
   107    
   108  	movl	%eax, %ebx	C keep src in ebx, mull overwrites eax
   109  	movl	(%eax), %eax
   110    
   111  	mull	%eax		C src[0]^2
   112    
   113  	pushl	%esi
   114  	pushl	%edi
   115    
   116  	movl	%edx, %esi	C dst[1]
   117  	movl	%eax, (%ecx)	C dst[0]
   118    
   119  	movl	4(%ebx), %eax
   120  	mull	%eax		C src[1]^2
   121    
   122  	movl	%eax, %edi	C dst[2]
   123  	movl	%edx, %ebp	C dst[3]
   124    
   125  	movl	(%ebx), %eax
   126  	mull	4(%ebx)		C src[0]*src[1]
   127    
   128  	addl	%eax, %esi	C add the cross product once into esi,edi,ebp
   129    
   130  	adcl	%edx, %edi
   131    
   132  	adcl	$0, %ebp
   133  	addl	%esi, %eax	C add it a second time, eax = final dst[1]
   134    
   135  	adcl	%edi, %edx	C edx = final dst[2]
   136  	movl	%eax, 4(%ecx)	C the store leaves the carry flag for the adcl below
   137    
   138  	adcl	$0, %ebp	C ebp = final dst[3]
   139    
   140  	movl	%edx, 8(%ecx)
   141  	movl	%ebp, 12(%ecx)
   142    
   143  	popl	%edi		C pops mirror the four pushes above
   144  	popl	%esi
   145    
   146  	popl	%ebp
   147  	popl	%ebx
   148    
   149  	ret
   150  
   151  
   152  C -----------------------------------------------------------------------------
   153  	ALIGN(8)
   154  L(three_or_more):
   155  deflit(`FRAME',0)
   156  	C eax	src
   157  	C ebx
   158  	C ecx	dst
   159  	C edx	size
   160  	C NOTE(review): FRAME_pushl presumably keeps PARAM_* offsets valid, confirm
   161  	pushl	%ebx	FRAME_pushl()
   162  	pushl	%edi	FRAME_pushl()
   163    
   164  	pushl	%esi	FRAME_pushl()
   165  	pushl	%ebp	FRAME_pushl()
   166    
   167  	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
   168  	leal	(%eax,%edx,4), %esi	C &src[size]
   169    
   170  C First multiply src[0]*src[1..size-1] and store at dst[1..size].
   171  C Both arrays are indexed from their ends with a negative counter.
   172  	movl	(%eax), %ebp		C src[0], multiplier
   173  	movl	%edx, %ecx
   174    
   175  	negl	%ecx			C -size
   176  	xorl	%ebx, %ebx		C clear carry limb
   177    
   178  	incl	%ecx			C -(size-1)
   179    
   180  L(mul1):
   181  	C eax	scratch
   182  	C ebx	carry
   183  	C ecx	counter, limbs, negative
   184  	C edx	scratch
   185  	C esi	&src[size]
   186  	C edi	&dst[size]
   187  	C ebp	multiplier
   188    
   189  	movl	(%esi,%ecx,4), %eax	C src[size+ecx]
   190  	mull	%ebp			C edx:eax = src[size+ecx] * src[0]
   191  	addl	%eax, %ebx		C low product plus incoming carry limb
   192  	adcl	$0, %edx		C propagate carry into the high product
   193  	movl	%ebx, (%edi,%ecx,4)	C dst[size+ecx]
   194  	movl	%edx, %ebx		C high product becomes the next carry limb
   195  	incl	%ecx
   196  	jnz	L(mul1)
   197    
   198  	movl	%ebx, (%edi)		C dst[size] = final carry limb
   199  
   200  
   201  	C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
   202  	C n=1..size-2.
   203  	C
   204  	C The last product src[size-2]*src[size-1], which is the end corner
   205  	C of the product triangle, is handled separately at the end to save
   206  	C looping overhead.  If size is 3 then it's only this that needs to
   207  	C be done.
   208  	C
   209  	C In the outer loop %esi is a constant, and %edi just advances by 1
   210  	C limb each time.  The size of the operation decreases by 1 limb
   211  	C each time.
   212  
   213  	C eax
   214  	C ebx
   215  	C ecx
   216  	C edx
   217  	C esi	&src[size]
   218  	C edi	&dst[size]
   219  	C ebp
   220  
   221  	movl	PARAM_SIZE, %ecx
   222  	subl	$3, %ecx		C size-3, the number of outer loop passes
   223  	jz	L(corner)		C size == 3, only the corner remains
   224  
   225  	negl	%ecx			C -(size-3)
   226  
   227  dnl  re-use parameter space, PARAM_DST is not read again after entry
   228  define(VAR_OUTER,`PARAM_DST')
   229  
   230  L(outer):
   231  	C eax
   232  	C ebx
   233  	C ecx	outer loop counter, -(size-3) to -1
   234  	C edx
   235  	C esi	&src[size]
   236  	C edi	dst, pointing at stored carry limb of previous loop
   237  	C ebp
   238  
   239  	movl	%ecx, VAR_OUTER		C save it, ecx also serves as inner counter
   240  	addl	$4, %edi		C advance dst end
   241  
   242  	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
   243  	subl	$1, %ecx		C inner loop starts at the limb after it
   244  
   245  	xorl	%ebx, %ebx		C initial carry limb
   246  
   247  L(inner):
   248  	C eax	scratch
   249  	C ebx	carry limb
   250  	C ecx	counter, negative, last pass is -1
   251  	C edx	scratch
   252  	C esi	&src[size]
   253  	C edi	dst end of this addmul
   254  	C ebp	multiplier
   255  
   256  	movl	(%esi,%ecx,4), %eax
   257  	mull	%ebp
   258  	addl	%ebx, %eax		C low product plus incoming carry limb
   259  	adcl	$0, %edx
   260  	addl	%eax, (%edi,%ecx,4)	C accumulate into dst
   261  	adcl	$0, %edx		C carry out of dst goes into the high product
   262  	movl	%edx, %ebx		C next carry limb
   263  	addl	$1, %ecx
   264  	jl	L(inner)		C repeat while the counter is still negative
   265  
   266  
   267  	movl	%ebx, (%edi)		C store the final carry limb at dst end
   268  	movl	VAR_OUTER, %ecx
   269  	incl	%ecx
   270  	jnz	L(outer)
   271  
   272  
   273  L(corner):
   274  	C esi	&src[size]
   275  	C edi	&dst[2*size-3]
   276  
   277  	movl	-4(%esi), %eax
   278  	mull	-8(%esi)		C src[size-1]*src[size-2]
   279  	addl	%eax, 0(%edi)		C add low limb to the stored carry limb
   280  	adcl	$0, %edx
   281  	movl	%edx, 4(%edi)		C dst high limb
   282  
   283  
   284  C -----------------------------------------------------------------------------
   285  C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
   286  C This doubles the sum of cross products; each pass shifts two limbs.
   287  	movl	PARAM_SIZE, %eax
   288  	negl	%eax
   289  	addl	$1, %eax		C -(size-1) and clear carry
   290  
   291  L(lshift):
   292  	C eax	counter, negative
   293  	C ebx
   294  	C ecx
   295  	C edx
   296  	C esi
   297  	C edi	&dst[2*size-3]
   298  	C ebp
   299  
   300  	rcll	8(%edi,%eax,8)		C shift one bit left through the carry flag
   301  	rcll	12(%edi,%eax,8)
   302  	incl	%eax			C incl leaves the carry flag for the next rcll
   303  	jnz	L(lshift)
   304  
   305  
   306  	adcl	%eax, %eax		C high bit out
   307  	movl	%eax, 8(%edi)		C dst most significant limb
   308  
   309  
   310  C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
   311  C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
   312  C low limb of src[0]^2.
   313  
   314  	movl	PARAM_SRC, %esi
   315  	movl	(%esi), %eax		C src[0]
   316  	mull	%eax			C src[0]^2
   317  
   318  	movl	PARAM_SIZE, %ecx
   319  	leal	(%esi,%ecx,4), %esi	C src end
   320  
   321  	negl	%ecx			C -size
   322  	movl	%edx, %ebx		C initial carry
   323  
   324  	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
   325  	incl	%ecx			C -(size-1)
   326  
   327  L(diag):
   328  	C eax	scratch (low product)
   329  	C ebx	carry limb
   330  	C ecx	counter, -(size-1) to -1
   331  	C edx	scratch (high product)
   332  	C esi	&src[size]
   333  	C edi	&dst[2*size-3]
   334  	C ebp
   335  
   336  	movl	(%esi,%ecx,4), %eax	C src[i], with i = size+ecx
   337  	mull	%eax			C src[i]^2
   338  
   339  	addl	%ebx, 8(%edi,%ecx,8)	C dst[2*i-1] += high limb of previous square
   340  	movl	%edx, %ebx		C movl keeps the carry flag for the adcl below
   341  
   342  	adcl	%eax, 12(%edi,%ecx,8)	C dst[2*i] += low limb of this square + carry
   343  	adcl	$0, %ebx		C fold the carry into the next carry limb
   344  
   345  	incl	%ecx
   346  	jnz	L(diag)
   347  
   348  
   349  	addl	%ebx, 8(%edi)		C dst most significant limb
   350  
   351  	popl	%ebp			C pops mirror the pushes at L(three_or_more)
   352  	popl	%esi
   353  
   354  	popl	%edi
   355  	popl	%ebx
   356  
   357  	ret
   358  
   359  EPILOGUE()