github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/atom/sse2/mul_basecase.asm (about)

     1  dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
     2  dnl  a third limb vector.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     5  
     6  dnl  Copyright 2011 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  C TODO
    37  C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
    38  C    4 large loops into one; we could use it for the outer loop branch.
    39  C  * Optimise code outside of inner loops.
    40  C  * Write combined addmul_1 feed-in and wind-down code, and use it when
    41  C    iterating each outer loop.  ("Overlapping software pipelining")
    42  C  * Postpone push of ebx until we know vn > 1.  Perhaps use caller-saves regs
    43  C    for inlined mul_1, allowing us to postpone all pushes.
    44  C  * Perhaps write special code for vn <= un < M, for some small M.
    45  
    46  C void mpn_mul_basecase (mp_ptr wp,
    47  C                        mp_srcptr xp, mp_size_t xn,
    48  C                        mp_srcptr yp, mp_size_t yn);
    49  C
    50  
    51  define(`rp',  `%edi')	C result pointer (wp in the prototype comment)
    52  define(`up',  `%esi')	C first source pointer (xp in the prototype comment)
    53  define(`un',  `%ecx')	C limb count of the first source (xn)
    54  define(`vp',  `%ebp')	C second source pointer (yp)
    55  define(`vn',  `36(%esp)')	C yn stack slot (offset valid after the 4 prologue pushes); decremented in place as the outer-loop counter
    56  
    57  	TEXT
    58  	ALIGN(16)
    59  PROLOGUE(mpn_mul_basecase)
        C Multiplies the un-limb vector at up by the vn-limb vector at vp and
        C stores the product at rp.  Limbs are 32 bits wide: every limb access
        C is a movd or a 32-bit integer operation, and consecutive limbs are
        C 4 bytes apart.
        C
        C Layout of the routine:
        C   1. First row: rp[] = up[] * vp[0], computed with pmuludq and a
        C      64-bit running sum in mm6 (mm7 holds the current vp limb).
        C   2. Every further vp limb: rp[] += up[] * vp[i], accumulated with
        C      an integer add/adc carry chain in eax/ebx/edx.
        C Both steps are unrolled four ways.  un mod 4 selects one of four
        C copies of the code (digit 0-3 in the label names); the copies differ
        C in where the unrolled loop body is entered.
        C
        C NOTE(review): the entry points assume un >= 1 and vn >= 1, and the
        C L(ol1) path looks like it needs un > 1 whenever vn > 1 -- presumably
        C guaranteed by a un >= vn >= 1 caller contract; confirm.
    60  	push	%edi		C preserve edi/esi/ebx/ebp; restored at L(done)
    61  	push	%esi
    62  	push	%ebx
    63  	push	%ebp
    64  	mov	20(%esp), rp	C arguments start at 20(%esp): 4 pushes + return address
    65  	mov	24(%esp), up
    66  	mov	28(%esp), un
    67  	mov	32(%esp), vp
    68  
    69  	movd	(up), %mm0
    70  	movd	(vp), %mm7	C mm7 = vp[0], the multiplier for the whole first row
    71  	pmuludq	%mm7, %mm0	C mm0 = up[0] * vp[0], full 64-bit product
    72  	pxor	%mm6, %mm6	C mm6 = 0: running sum, its high half is the carry
    73  
    74  	mov	un, %eax
    75  	and	$3, %eax	C eax = un mod 4 selects the loop variant
    76  	jz	L(of0)		C un mod 4 == 0
    77  	cmp	$2, %eax
    78  	jc	L(of1)		C un mod 4 == 1
    79  	jz	L(of2)		C un mod 4 == 2
    80  
    81  C ================================================================
        C Variant for un mod 4 == 3.
    82  	jmp	L(m3)
    83  	ALIGN(16)
    84  L(lm3):	movd	-4(up), %mm0
    85  	pmuludq	%mm7, %mm0
    86  	psrlq	$32, %mm6	C drop the limb just stored, keep the carry
    87  	lea	16(rp), rp
    88  	paddq	%mm0, %mm6	C carry + next 64-bit product
    89  	movd	(up), %mm0
    90  	pmuludq	%mm7, %mm0
    91  	movd	%mm6, -4(rp)	C low 32 bits of the sum are the result limb
    92  	psrlq	$32, %mm6
    93  L(m3):	paddq	%mm0, %mm6	C first-row entry point for un mod 4 == 3
    94  	movd	4(up), %mm0
    95  	pmuludq	%mm7, %mm0
    96  	movd	%mm6, (rp)
    97  	psrlq	$32, %mm6
    98  	paddq	%mm0, %mm6
    99  	movd	8(up), %mm0
   100  	pmuludq	%mm7, %mm0
   101  	movd	%mm6, 4(rp)
   102  	psrlq	$32, %mm6
   103  	paddq	%mm0, %mm6
   104  	sub	$4, un		C sets the flags tested by the ja below
   105  	movd	%mm6, 8(rp)
   106  	lea	16(up), up
   107  	ja	L(lm3)		C loop while the count is still above zero (movd/lea keep flags)
   108  
   109  	psrlq	$32, %mm6
   110  	movd	%mm6, 12(rp)	C final carry becomes the top limb of the first row
   111  
   112  	decl	vn		C vp[0] has been consumed
   113  	jz	L(done)		C vn was 1: no rows left to accumulate
   114  	lea	-8(rp), rp	C bias rp for the address computation at L(ol3)
   115  
   116  L(ol3):	mov	28(%esp), un	C outer loop: reload un from its stack slot
   117  	neg	un
   118  	lea	4(vp), vp
   119  	movd	(vp), %mm7	C read next V limb
   120  	mov	24(%esp), up	C rewind up to the start of the first source
   121  	lea	16(rp,un,4), rp	C un is negative here: rp += 16 - 4 * (original un)
   122  
   123  	movd	(up), %mm0
   124  	pmuludq	%mm7, %mm0
   125  	sar	$2, un		C un = floor(un / 4), a negative count stepped up by inc
   126  	movd	4(up), %mm1
   127  	movd	%mm0, %ebx
   128  	pmuludq	%mm7, %mm1
   129  	lea	-8(up), up
   130  	xor	%edx, %edx	C zero edx and CF
   131  	jmp	L(a3)
   132  
        C Inner loop: eax and ebx alternately hold the low half of a product,
        C edx the high half.  Each "adc $0, %edx" folds the carry of the
        C preceding adc into the high half before the following add to memory
        C replaces CF; the next adc then picks up the carry of that add.
   133  L(la3):	movd	4(up), %mm1
   134  	adc	$0, %edx
   135  	add	%eax, 12(rp)
   136  	movd	%mm0, %ebx
   137  	pmuludq	%mm7, %mm1
   138  	lea	16(rp), rp
   139  	psrlq	$32, %mm0
   140  	adc	%edx, %ebx
   141  	movd	%mm0, %edx
   142  	movd	%mm1, %eax
   143  	movd	8(up), %mm0
   144  	pmuludq	%mm7, %mm0
   145  	adc	$0, %edx
   146  	add	%ebx, (rp)
   147  	psrlq	$32, %mm1
   148  	adc	%edx, %eax
   149  	movd	%mm1, %edx
   150  	movd	%mm0, %ebx
   151  	movd	12(up), %mm1
   152  	pmuludq	%mm7, %mm1
   153  	adc	$0, %edx
   154  	add	%eax, 4(rp)
   155  L(a3):	psrlq	$32, %mm0
   156  	adc	%edx, %ebx
   157  	movd	%mm0, %edx
   158  	movd	%mm1, %eax
   159  	lea	16(up), up
   160  	movd	(up), %mm0
   161  	adc	$0, %edx
   162  	add	%ebx, 8(rp)
   163  	psrlq	$32, %mm1
   164  	adc	%edx, %eax
   165  	movd	%mm1, %edx
   166  	pmuludq	%mm7, %mm0
   167  	inc	un		C inc leaves CF alone; the pending carry is consumed after the branch
   168  	jnz	L(la3)
   169  
   170  	adc	un, %edx	C un is zero here
   171  	add	%eax, 12(rp)
   172  	movd	%mm0, %ebx
   173  	psrlq	$32, %mm0
   174  	adc	%edx, %ebx
   175  	movd	%mm0, %eax
   176  	adc	un, %eax
   177  	add	%ebx, 16(rp)
   178  	adc	un, %eax
   179  	mov	%eax, 20(rp)	C top limb of this row is stored, not added
   180  
   181  	decl	vn
   182  	jnz	L(ol3)		C more vp limbs: accumulate the next row
   183  	jmp	L(done)
   184  
   185  C ================================================================
        C Variant for un mod 4 == 0.
   186  	ALIGN(16)
   187  L(lm0):	movd	(up), %mm0
   188  	pmuludq	%mm7, %mm0
   189  	psrlq	$32, %mm6
   190  	lea	16(rp), rp
   191  L(of0):	paddq	%mm0, %mm6	C first-row entry point for un mod 4 == 0
   192  	movd	4(up), %mm0
   193  	pmuludq	%mm7, %mm0
   194  	movd	%mm6, (rp)
   195  	psrlq	$32, %mm6
   196  	paddq	%mm0, %mm6
   197  	movd	8(up), %mm0
   198  	pmuludq	%mm7, %mm0
   199  	movd	%mm6, 4(rp)
   200  	psrlq	$32, %mm6
   201  	paddq	%mm0, %mm6
   202  	movd	12(up), %mm0
   203  	pmuludq	%mm7, %mm0
   204  	movd	%mm6, 8(rp)
   205  	psrlq	$32, %mm6
   206  	paddq	%mm0, %mm6
   207  	sub	$4, un		C sets the flags tested by the ja below
   208  	movd	%mm6, 12(rp)
   209  	lea	16(up), up
   210  	ja	L(lm0)
   211  
   212  	psrlq	$32, %mm6
   213  	movd	%mm6, 16(rp)	C final carry becomes the top limb of the first row
   214  
   215  	decl	vn
   216  	jz	L(done)
   217  	lea	-4(rp), rp	C bias rp for the address computation at L(ol0)
   218  
   219  L(ol0):	mov	28(%esp), un	C outer loop: reload un from its stack slot
   220  	neg	un
   221  	lea	4(vp), vp
   222  	movd	(vp), %mm7	C read next V limb
   223  	mov	24(%esp), up	C rewind up to the start of the first source
   224  	lea	20(rp,un,4), rp	C un is negative here: rp += 20 - 4 * (original un)
   225  
   226  	movd	(up), %mm1
   227  	pmuludq	%mm7, %mm1
   228  	sar	$2, un		C un = floor(un / 4), a negative count stepped up by inc
   229  	movd	4(up), %mm0
   230  	lea	-4(up), up
   231  	movd	%mm1, %eax
   232  	pmuludq	%mm7, %mm0
   233  	xor	%edx, %edx	C zero edx and CF
   234  	jmp	L(a0)
   235  
        C Same add/adc carry chain as in the un mod 4 == 3 variant; only the
        C entry point into the unrolled body, L(a0), differs.
   236  L(la0):	movd	4(up), %mm1
   237  	adc	$0, %edx
   238  	add	%eax, 12(rp)
   239  	movd	%mm0, %ebx
   240  	pmuludq	%mm7, %mm1
   241  	lea	16(rp), rp
   242  	psrlq	$32, %mm0
   243  	adc	%edx, %ebx
   244  	movd	%mm0, %edx
   245  	movd	%mm1, %eax
   246  	movd	8(up), %mm0
   247  	pmuludq	%mm7, %mm0
   248  	adc	$0, %edx
   249  	add	%ebx, (rp)
   250  L(a0):	psrlq	$32, %mm1
   251  	adc	%edx, %eax
   252  	movd	%mm1, %edx
   253  	movd	%mm0, %ebx
   254  	movd	12(up), %mm1
   255  	pmuludq	%mm7, %mm1
   256  	adc	$0, %edx
   257  	add	%eax, 4(rp)
   258  	psrlq	$32, %mm0
   259  	adc	%edx, %ebx
   260  	movd	%mm0, %edx
   261  	movd	%mm1, %eax
   262  	lea	16(up), up
   263  	movd	(up), %mm0
   264  	adc	$0, %edx
   265  	add	%ebx, 8(rp)
   266  	psrlq	$32, %mm1
   267  	adc	%edx, %eax
   268  	movd	%mm1, %edx
   269  	pmuludq	%mm7, %mm0
   270  	inc	un		C inc leaves CF alone; the pending carry is consumed after the branch
   271  	jnz	L(la0)
   272  
   273  	adc	un, %edx	C un is zero here
   274  	add	%eax, 12(rp)
   275  	movd	%mm0, %ebx
   276  	psrlq	$32, %mm0
   277  	adc	%edx, %ebx
   278  	movd	%mm0, %eax
   279  	adc	un, %eax
   280  	add	%ebx, 16(rp)
   281  	adc	un, %eax
   282  	mov	%eax, 20(rp)	C top limb of this row is stored, not added
   283  
   284  	decl	vn
   285  	jnz	L(ol0)		C more vp limbs: accumulate the next row
   286  	jmp	L(done)
   287  
   288  C ================================================================
        C Variant for un mod 4 == 1.
   289  	ALIGN(16)
   290  L(lm1):	movd	-12(up), %mm0
   291  	pmuludq	%mm7, %mm0
   292  	psrlq	$32, %mm6
   293  	lea	16(rp), rp
   294  	paddq	%mm0, %mm6
   295  	movd	-8(up), %mm0
   296  	pmuludq	%mm7, %mm0
   297  	movd	%mm6, -12(rp)
   298  	psrlq	$32, %mm6
   299  	paddq	%mm0, %mm6
   300  	movd	-4(up), %mm0
   301  	pmuludq	%mm7, %mm0
   302  	movd	%mm6, -8(rp)
   303  	psrlq	$32, %mm6
   304  	paddq	%mm0, %mm6
   305  	movd	(up), %mm0
   306  	pmuludq	%mm7, %mm0
   307  	movd	%mm6, -4(rp)
   308  	psrlq	$32, %mm6
   309  L(of1):	paddq	%mm0, %mm6	C first-row entry point for un mod 4 == 1
   310  	sub	$4, un		C sets the flags tested by the ja below
   311  	movd	%mm6, (rp)
   312  	lea	16(up), up
   313  	ja	L(lm1)
   314  
   315  	psrlq	$32, %mm6
   316  	movd	%mm6, 4(rp)	C final carry becomes the top limb of the first row
   317  
   318  	decl	vn
   319  	jz	L(done)
   320  	lea	-16(rp), rp	C bias rp for the address computation at L(ol1)
   321  
   322  L(ol1):	mov	28(%esp), un	C outer loop: reload un from its stack slot
   323  	neg	un
   324  	lea	4(vp), vp
   325  	movd	(vp), %mm7	C read next V limb
   326  	mov	24(%esp), up	C rewind up to the start of the first source
   327  	lea	24(rp,un,4), rp	C un is negative here: rp += 24 - 4 * (original un)
   328  
   329  	movd	(up), %mm0
   330  	pmuludq	%mm7, %mm0
   331  	sar	$2, un		C un = floor(un / 4), a negative count stepped up by inc
   332  	movd	%mm0, %ebx
   333  	movd	4(up), %mm1
   334  	pmuludq	%mm7, %mm1
   335  	xor	%edx, %edx	C zero edx and CF
   336  	inc	un		C adjust the pass count for the L(a1) entry; inc keeps CF clear
   337  	jmp	L(a1)
   338  
        C Same add/adc carry chain as in the un mod 4 == 3 variant; only the
        C entry point into the unrolled body, L(a1), differs.
   339  L(la1):	movd	4(up), %mm1
   340  	adc	$0, %edx
   341  	add	%eax, 12(rp)
   342  	movd	%mm0, %ebx
   343  	pmuludq	%mm7, %mm1
   344  	lea	16(rp), rp
   345  L(a1):	psrlq	$32, %mm0
   346  	adc	%edx, %ebx
   347  	movd	%mm0, %edx
   348  	movd	%mm1, %eax
   349  	movd	8(up), %mm0
   350  	pmuludq	%mm7, %mm0
   351  	adc	$0, %edx
   352  	add	%ebx, (rp)
   353  	psrlq	$32, %mm1
   354  	adc	%edx, %eax
   355  	movd	%mm1, %edx
   356  	movd	%mm0, %ebx
   357  	movd	12(up), %mm1
   358  	pmuludq	%mm7, %mm1
   359  	adc	$0, %edx
   360  	add	%eax, 4(rp)
   361  	psrlq	$32, %mm0
   362  	adc	%edx, %ebx
   363  	movd	%mm0, %edx
   364  	movd	%mm1, %eax
   365  	lea	16(up), up
   366  	movd	(up), %mm0
   367  	adc	$0, %edx
   368  	add	%ebx, 8(rp)
   369  	psrlq	$32, %mm1
   370  	adc	%edx, %eax
   371  	movd	%mm1, %edx
   372  	pmuludq	%mm7, %mm0
   373  	inc	un		C inc leaves CF alone; the pending carry is consumed after the branch
   374  	jnz	L(la1)
   375  
   376  	adc	un, %edx	C un is zero here
   377  	add	%eax, 12(rp)
   378  	movd	%mm0, %ebx
   379  	psrlq	$32, %mm0
   380  	adc	%edx, %ebx
   381  	movd	%mm0, %eax
   382  	adc	un, %eax
   383  	add	%ebx, 16(rp)
   384  	adc	un, %eax
   385  	mov	%eax, 20(rp)	C top limb of this row is stored, not added
   386  
   387  	decl	vn
   388  	jnz	L(ol1)		C more vp limbs: accumulate the next row
   389  	jmp	L(done)
   390  
   391  C ================================================================
        C Variant for un mod 4 == 2.
   392  	ALIGN(16)
   393  L(lm2):	movd	-8(up), %mm0
   394  	pmuludq	%mm7, %mm0
   395  	psrlq	$32, %mm6
   396  	lea	16(rp), rp
   397  	paddq	%mm0, %mm6
   398  	movd	-4(up), %mm0
   399  	pmuludq	%mm7, %mm0
   400  	movd	%mm6, -8(rp)
   401  	psrlq	$32, %mm6
   402  	paddq	%mm0, %mm6
   403  	movd	(up), %mm0
   404  	pmuludq	%mm7, %mm0
   405  	movd	%mm6, -4(rp)
   406  	psrlq	$32, %mm6
   407  L(of2):	paddq	%mm0, %mm6	C first-row entry point for un mod 4 == 2
   408  	movd	4(up), %mm0
   409  	pmuludq	%mm7, %mm0
   410  	movd	%mm6, (rp)
   411  	psrlq	$32, %mm6
   412  	paddq	%mm0, %mm6
   413  	sub	$4, un		C sets the flags tested by the ja below
   414  	movd	%mm6, 4(rp)
   415  	lea	16(up), up
   416  	ja	L(lm2)
   417  
   418  	psrlq	$32, %mm6
   419  	movd	%mm6, 8(rp)	C final carry becomes the top limb of the first row
   420  
   421  	decl	vn
   422  	jz	L(done)
   423  	lea	-12(rp), rp	C bias rp for the address computation at L(ol2)
   424  
   425  L(ol2):	mov	28(%esp), un	C outer loop: reload un from its stack slot
   426  	neg	un
   427  	lea	4(vp), vp
   428  	movd	(vp), %mm7	C read next V limb
   429  	mov	24(%esp), up	C rewind up to the start of the first source
   430  	lea	12(rp,un,4), rp	C un is negative here: rp += 12 - 4 * (original un)
   431  
   432  	movd	(up), %mm1
   433  	pmuludq	%mm7, %mm1
   434  	sar	$2, un		C un = floor(un / 4), a negative count stepped up by inc
   435  	movd	4(up), %mm0	C multiplied by mm7 after the jump, at L(lo2)
   436  	lea	4(up), up
   437  	movd	%mm1, %eax
   438  	xor	%edx, %edx	C zero edx and CF
   439  	jmp	L(lo2)
   440  
        C Same add/adc carry chain as in the un mod 4 == 3 variant; only the
        C entry point into the unrolled body, L(lo2), differs.
   441  L(la2):	movd	4(up), %mm1
   442  	adc	$0, %edx
   443  	add	%eax, 12(rp)
   444  	movd	%mm0, %ebx
   445  	pmuludq	%mm7, %mm1
   446  	lea	16(rp), rp
   447  	psrlq	$32, %mm0
   448  	adc	%edx, %ebx
   449  	movd	%mm0, %edx
   450  	movd	%mm1, %eax
   451  	movd	8(up), %mm0
   452  	pmuludq	%mm7, %mm0
   453  	adc	$0, %edx
   454  	add	%ebx, (rp)
   455  	psrlq	$32, %mm1
   456  	adc	%edx, %eax
   457  	movd	%mm1, %edx
   458  	movd	%mm0, %ebx
   459  	movd	12(up), %mm1
   460  	pmuludq	%mm7, %mm1
   461  	adc	$0, %edx
   462  	add	%eax, 4(rp)
   463  	psrlq	$32, %mm0
   464  	adc	%edx, %ebx
   465  	movd	%mm0, %edx
   466  	movd	%mm1, %eax
   467  	lea	16(up), up
   468  	movd	(up), %mm0
   469  	adc	$0, %edx
   470  	add	%ebx, 8(rp)
   471  L(lo2):	psrlq	$32, %mm1
   472  	adc	%edx, %eax
   473  	movd	%mm1, %edx
   474  	pmuludq	%mm7, %mm0
   475  	inc	un		C inc leaves CF alone; the pending carry is consumed after the branch
   476  	jnz	L(la2)
   477  
   478  	adc	un, %edx	C un is zero here
   479  	add	%eax, 12(rp)
   480  	movd	%mm0, %ebx
   481  	psrlq	$32, %mm0
   482  	adc	%edx, %ebx
   483  	movd	%mm0, %eax
   484  	adc	un, %eax
   485  	add	%ebx, 16(rp)
   486  	adc	un, %eax
   487  	mov	%eax, 20(rp)	C top limb of this row is stored, not added
   488  
   489  	decl	vn
   490  	jnz	L(ol2)		C more vp limbs: accumulate the next row
   491  C	jmp	L(done)
        C The jump above is left commented out: this variant is placed last
        C and simply falls through into L(done).
   492  
   493  C ================================================================
   494  L(done):
   495  	emms			C empty the MMX state before returning
   496  	pop	%ebp		C restore registers in reverse push order
   497  	pop	%ebx
   498  	pop	%esi
   499  	pop	%edi
   500  	ret
   501  EPILOGUE()