github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mmx/mul_1.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium/mmx/mul_1.asm (about)

     1  dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
     2  
     3  dnl  Copyright 2000-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C    cycles/limb
    35  C P5:   12.0   for 32-bit multiplier
    36  C        7.0   for 16-bit multiplier
    37  
    38  
    39  C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    40  C                      mp_limb_t multiplier);
    41  C
    42  C When the multiplier is 16 bits some special case MMX code is used.  Small
    43  C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
    44  C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
    45  C size==8 end up being quite close.  If src isn't aligned to an 8 byte
    46  C boundary then one limb is processed separately with roughly a 5 cycle
    47  C penalty, so in that case it's say size==8 and size==9 which are close.
    48  C
    49  C Alternatives:
    50  C
    51  C MMX is not believed to be of any use for 32-bit multipliers, since for
    52  C instance the current method would just have to be more or less duplicated
    53  C for the high and low halves of the multiplier, and would probably
    54  C therefore run at about 14 cycles, which is slower than the plain integer
    55  C at 12.
    56  C
    57  C Adding the high and low MMX products using integer code seems best.  An
    58  C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
    59  C any joy.  Perhaps something could be done keeping the values signed and
    60  C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
    61  C perhaps not.
    62  C
    63  C Future:
    64  C
    65  C An mpn_mul_1c entrypoint would need a double carry out of the low result
    66  C limb in the 16-bit code, unless it could be assumed the carry fits in 16
    67  C bits, possibly as carry<multiplier, this being true of a big calculation
    68  C done piece by piece.  But let's worry about that if/when mul_1c is
    69  C actually used.
    70  
    71  defframe(PARAM_MULTIPLIER,16)
    72  defframe(PARAM_SIZE,      12)
    73  defframe(PARAM_SRC,       8)
    74  defframe(PARAM_DST,       4)
    75  
    76  	TEXT
    77  
    78  	ALIGN(8)
    79  PROLOGUE(mpn_mul_1)
    80  deflit(`FRAME',0)
    81  
    82  	movl	PARAM_SIZE, %ecx
    83  	movl	PARAM_SRC, %edx
    84  
    85  	cmpl	$1, %ecx
    86  	jne	L(two_or_more)
    87  
    88  	C one limb only
    89  
    90  	movl	PARAM_MULTIPLIER, %eax
    91  	movl	PARAM_DST, %ecx
    92  
    93  	mull	(%edx)
    94  
    95  	movl	%eax, (%ecx)
    96  	movl	%edx, %eax
    97  
    98  	ret
    99  
   100  
   101  L(two_or_more):
   102  	C eax	size
   103  	C ebx
   104  	C ecx	carry
   105  	C edx
   106  	C esi	src
   107  	C edi
   108  	C ebp
   109  
   110  	pushl	%esi		FRAME_pushl()
   111  	pushl	%edi		FRAME_pushl()
   112  
   113  	movl	%edx, %esi		C src
   114  	movl	PARAM_DST, %edi
   115  
   116  	movl	PARAM_MULTIPLIER, %eax
   117  	pushl	%ebx		FRAME_pushl()
   118  
   119  	leal	(%esi,%ecx,4), %esi	C src end
   120  	leal	(%edi,%ecx,4), %edi	C dst end
   121  
   122  	negl	%ecx			C -size
   123  
   124  	pushl	%ebp		FRAME_pushl()
   125  	cmpl	$65536, %eax
   126  
   127  	jb	L(small)
   128  
   129  
   130  L(big):
   131  	xorl	%ebx, %ebx		C carry limb
   132  	sarl	%ecx			C -size/2
   133  
   134  	jnc	L(top)			C with carry flag clear
   135  
   136  
   137  	C size was odd, process one limb separately
   138  
   139  	mull	4(%esi,%ecx,8)		C m * src[0]
   140  
   141  	movl	%eax, 4(%edi,%ecx,8)
   142  	incl	%ecx
   143  
   144  	orl	%edx, %ebx		C carry limb, and clear carry flag
   145  
   146  
   147  L(top):
   148  	C eax
   149  	C ebx	carry
   150  	C ecx	counter, negative
   151  	C edx
   152  	C esi	src end
   153  	C edi	dst end
   154  	C ebp	(scratch carry)
   155  
   156  	adcl	$0, %ebx
   157  	movl	(%esi,%ecx,8), %eax
   158  
   159  	mull	PARAM_MULTIPLIER
   160  
   161  	movl	%edx, %ebp
   162  	addl	%eax, %ebx
   163  
   164  	adcl	$0, %ebp
   165  	movl	4(%esi,%ecx,8), %eax
   166  
   167  	mull	PARAM_MULTIPLIER
   168  
   169  	movl	%ebx, (%edi,%ecx,8)
   170  	addl	%ebp, %eax
   171  
   172  	movl	%eax, 4(%edi,%ecx,8)
   173  	incl	%ecx
   174  
   175  	movl	%edx, %ebx
   176  	jnz	L(top)
   177  
   178  
   179  	adcl	$0, %ebx
   180  	popl	%ebp
   181  
   182  	movl	%ebx, %eax
   183  	popl	%ebx
   184  
   185  	popl	%edi
   186  	popl	%esi
   187  
   188  	ret
   189  
   190  
   191  L(small):
   192  	C Special case for 16-bit multiplier.
   193  	C
   194  	C eax	multiplier
   195  	C ebx
   196  	C ecx	-size
   197  	C edx	src
   198  	C esi	src end
   199  	C edi	dst end
   200  	C ebp	multiplier
   201  
   202  	C size<3 not supported here.  At size==3 we're already a couple of
   203  	C cycles faster, so there's no threshold as such, just use the MMX
   204  	C as soon as possible.
   205  
   206  	cmpl	$-3, %ecx
   207  	ja	L(big)
   208  
   209  	movd	%eax, %mm7		C m
   210  	pxor	%mm6, %mm6		C initial carry word
   211  
   212  	punpcklwd %mm7, %mm7		C m replicated 2 times
   213  	addl	$2, %ecx		C -size+2
   214  
   215  	punpckldq %mm7, %mm7		C m replicated 4 times
   216  	andl	$4, %edx		C test alignment, clear carry flag
   217  
   218  	movq	%mm7, %mm0		C m
   219  	jz	L(small_entry)
   220  
   221  
   222  	C Source is unaligned, process one limb separately.
   223  	C
   224  	C Plain integer code is used here, since it's smaller and is about
   225  	C the same 13 cycles as an mmx block would be.
   226  	C
   227  	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
   228  	C the use of separate incl and orl.
   229  
   230  	mull	-8(%esi,%ecx,4)		C m * src[0]
   231  
   232  	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
   233  	incl	%ecx			C one limb processed
   234  
   235  	movd	%edx, %mm6		C initial carry
   236  
   237  	orl	%eax, %eax		C clear carry flag
   238  	jmp	L(small_entry)
   239  
   240  
   241  C The scheduling here is quite tricky, since so many instructions have
   242  C pairing restrictions.  In particular the js won't pair with a movd, and
   243  C can't be paired with an adc since it wants flags from the inc, so
   244  C instructions are rotated to the top of the loop to find somewhere useful
   245  C for it.
   246  C
   247  C Trouble has been taken to avoid overlapping successive loop iterations,
   248  C since that would greatly increase the size of the startup and finishup
   249  C code.  Actually there's probably not much advantage to be had from
   250  C overlapping anyway, since the difficulties are mostly with pairing, not
   251  C with latencies as such.
   252  C
   253  C In the comments x represents the src data and m the multiplier (16
   254  C bits, but replicated 4 times).
   255  C
   256  C The m signs calculated in %mm3 are a loop invariant and could be held in
   257  C say %mm5, but that would save only one instruction and hence be no faster.
   258  
   259  L(small_top):
   260  	C eax	l.low, then l.high
   261  	C ebx	(h.low)
   262  	C ecx	counter, -size+2 to 0 or 1
   263  	C edx	(h.high)
   264  	C esi	&src[size]
   265  	C edi	&dst[size]
   266  	C ebp
   267  	C
   268  	C %mm0	(high products)
   269  	C %mm1	(low products)
   270  	C %mm2	(adjust for m using x signs)
   271  	C %mm3	(adjust for x using m signs)
   272  	C %mm4
   273  	C %mm5
   274  	C %mm6	h.low, then carry
   275  	C %mm7	m replicated 4 times
   276  
   277  	movd	%mm6, %ebx		C h.low
   278  	psrlq	$32, %mm1		C l.high
   279  
   280  	movd	%mm0, %edx		C h.high
   281  	movq	%mm0, %mm6		C new c
   282  
   283  	adcl	%eax, %ebx
   284  	incl	%ecx
   285  
   286  	movd	%mm1, %eax		C l.high
   287  	movq	%mm7, %mm0
   288  
   289  	adcl	%eax, %edx
   290  	movl	%ebx, -16(%edi,%ecx,4)
   291  
   292  	movl	%edx, -12(%edi,%ecx,4)
   293  	psrlq	$32, %mm6		C c
   294  
   295  L(small_entry):
   296  	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
   297  	movq	%mm7, %mm1
   298  
   299  	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
   300  	movq	%mm7, %mm3
   301  
   302  	movq	-8(%esi,%ecx,4), %mm2	C x
   303  	psraw	$15, %mm3		C m signs
   304  
   305  	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
   306  	psraw	$15, %mm2		C x signs
   307  
   308  	paddw	%mm3, %mm0		C add x to h if m neg
   309  	pand	%mm7, %mm2		C m selected by x signs
   310  
   311  	paddw	%mm2, %mm0		C add m to h if x neg
   312  	incl	%ecx
   313  
   314  	movd	%mm1, %eax		C l.low
   315  	punpcklwd %mm0, %mm6		C c + h.low << 16
   316  
   317  	psrlq	$16, %mm0		C h.high
   318  	js	L(small_top)
   319  
   320  
   321  
   322  
   323  	movd	%mm6, %ebx		C h.low
   324  	psrlq	$32, %mm1		C l.high
   325  
   326  	adcl	%eax, %ebx
   327  	popl	%ebp		FRAME_popl()
   328  
   329  	movd	%mm0, %edx		C h.high
   330  	psrlq	$32, %mm0		C l.high
   331  
   332  	movd	%mm1, %eax		C l.high
   333  
   334  	adcl	%eax, %edx
   335  	movl	%ebx, -12(%edi,%ecx,4)
   336  
   337  	movd	%mm0, %eax		C c
   338  
   339  	adcl	$0, %eax
   340  	movl	%edx, -8(%edi,%ecx,4)
   341  
   342  	orl	%ecx, %ecx
   343  	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd
   344  
   345  
   346  	C Size odd, one extra limb to process.
   347  	C Plain integer code is used here, since it's smaller and is about
   348  	C the same speed as another mmx block would be.
   349  
   350  	movl	%eax, %ecx
   351  	movl	PARAM_MULTIPLIER, %eax
   352  
   353  	mull	-4(%esi)
   354  
   355  	addl	%ecx, %eax
   356  
   357  	adcl	$0, %edx
   358  	movl	%eax, -4(%edi)
   359  
   360  	movl	%edx, %eax
   361  L(small_done):
   362  	popl	%ebx
   363  
   364  	popl	%edi
   365  	popl	%esi
   366  
   367  	emms
   368  
   369  	ret
   370  
   371  EPILOGUE()