github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/atom/sse2/sqr_basecase.asm (about)

     1  dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for atom.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     4  
     5  dnl  Copyright 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C TODO
    36  C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
    37  C    4 large loops into one; we could use it for the outer loop branch.
    38  C  * Optimise code outside of inner loops.
    39  C  * Write combined addmul_1 feed-in and wind-down code, and use when
    40  C    iterating each outer loop.  ("Overlapping software pipelining")
    41  C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
    42  C    all pushes.
    43  C  * Perhaps write special code for n < M, for some small M.
    44  C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
    45  C    with even less pipelined code.
    46  C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
    47  C    Consider breaking out earlier, saving the high cost of short loops.
    48  
    49  C void mpn_sqr_basecase (mp_ptr wp,
    50  C                        mp_srcptr xp, mp_size_t xn);
    51  
    52  define(`rp',  `%edi')
    53  define(`up',  `%esi')
    54  define(`n',   `%ecx')
    55  
    56  define(`un',  `%ebp')
    57  
    58  	TEXT
    59  	ALIGN(16)
    60  PROLOGUE(mpn_sqr_basecase)
    61  	push	%edi
    62  	push	%esi
    63  	mov	12(%esp), rp
    64  	mov	16(%esp), up
    65  	mov	20(%esp), n
    66  
    67  	lea	4(rp), rp	C write triangular product starting at rp[1]
    68  	dec	n
    69  	movd	(up), %mm7
    70  
    71  	jz	L(one)
    72  	lea	4(up), up
    73  	push	%ebx
    74  	push	%ebp
    75  	mov	n, %eax
    76  
    77  	movd	(up), %mm0
    78  	neg	n
    79  	pmuludq	%mm7, %mm0
    80  	pxor	%mm6, %mm6
    81  	mov	n, un
    82  
    83  	and	$3, %eax
    84  	jz	L(of0)
    85  	cmp	$2, %eax
    86  	jc	L(of1)
    87  	jz	L(of2)
    88  
    89  C ================================================================
    90  	jmp	L(m3)
    91  	ALIGN(16)
    92  L(lm3):	movd	-4(up), %mm0
    93  	pmuludq	%mm7, %mm0
    94  	psrlq	$32, %mm6
    95  	lea	16(rp), rp
    96  	paddq	%mm0, %mm6
    97  	movd	(up), %mm0
    98  	pmuludq	%mm7, %mm0
    99  	movd	%mm6, -4(rp)
   100  	psrlq	$32, %mm6
   101  L(m3):	paddq	%mm0, %mm6
   102  	movd	4(up), %mm0
   103  	pmuludq	%mm7, %mm0
   104  	movd	%mm6, (rp)
   105  	psrlq	$32, %mm6
   106  	paddq	%mm0, %mm6
   107  	movd	8(up), %mm0
   108  	pmuludq	%mm7, %mm0
   109  	movd	%mm6, 4(rp)
   110  	psrlq	$32, %mm6
   111  	paddq	%mm0, %mm6
   112  	add	$4, un
   113  	movd	%mm6, 8(rp)
   114  	lea	16(up), up
   115  	js	L(lm3)
   116  
   117  	psrlq	$32, %mm6
   118  	movd	%mm6, 12(rp)
   119  
   120  	inc	n
   121  C	jz	L(done)
   122    lea	-12(up), up
   123    lea	4(rp), rp
   124  	jmp	L(ol2)
   125  
   126  C ================================================================
   127  	ALIGN(16)
   128  L(lm0):	movd	(up), %mm0
   129  	pmuludq	%mm7, %mm0
   130  	psrlq	$32, %mm6
   131  	lea	16(rp), rp
   132  L(of0):	paddq	%mm0, %mm6
   133  	movd	4(up), %mm0
   134  	pmuludq	%mm7, %mm0
   135  	movd	%mm6, (rp)
   136  	psrlq	$32, %mm6
   137  	paddq	%mm0, %mm6
   138  	movd	8(up), %mm0
   139  	pmuludq	%mm7, %mm0
   140  	movd	%mm6, 4(rp)
   141  	psrlq	$32, %mm6
   142  	paddq	%mm0, %mm6
   143  	movd	12(up), %mm0
   144  	pmuludq	%mm7, %mm0
   145  	movd	%mm6, 8(rp)
   146  	psrlq	$32, %mm6
   147  	paddq	%mm0, %mm6
   148  	add	$4, un
   149  	movd	%mm6, 12(rp)
   150  	lea	16(up), up
   151  	js	L(lm0)
   152  
   153  	psrlq	$32, %mm6
   154  	movd	%mm6, 16(rp)
   155  
   156  	inc	n
   157  C	jz	L(done)
   158    lea	-8(up), up
   159    lea	8(rp), rp
   160  	jmp	L(ol3)
   161  
   162  C ================================================================
   163  	ALIGN(16)
   164  L(lm1):	movd	-12(up), %mm0
   165  	pmuludq	%mm7, %mm0
   166  	psrlq	$32, %mm6
   167  	lea	16(rp), rp
   168  	paddq	%mm0, %mm6
   169  	movd	-8(up), %mm0
   170  	pmuludq	%mm7, %mm0
   171  	movd	%mm6, -12(rp)
   172  	psrlq	$32, %mm6
   173  	paddq	%mm0, %mm6
   174  	movd	-4(up), %mm0
   175  	pmuludq	%mm7, %mm0
   176  	movd	%mm6, -8(rp)
   177  	psrlq	$32, %mm6
   178  	paddq	%mm0, %mm6
   179  	movd	(up), %mm0
   180  	pmuludq	%mm7, %mm0
   181  	movd	%mm6, -4(rp)
   182  	psrlq	$32, %mm6
   183  L(of1):	paddq	%mm0, %mm6
   184  	add	$4, un
   185  	movd	%mm6, (rp)
   186  	lea	16(up), up
   187  	js	L(lm1)
   188  
   189  	psrlq	$32, %mm6
   190  	movd	%mm6, 4(rp)
   191  
   192  	inc	n
   193  	jz	L(done)		C goes away when we add special n=2 code
   194    lea	-20(up), up
   195    lea	-4(rp), rp
   196  	jmp	L(ol0)
   197  
   198  C ================================================================
   199  	ALIGN(16)
   200  L(lm2):	movd	-8(up), %mm0
   201  	pmuludq	%mm7, %mm0
   202  	psrlq	$32, %mm6
   203  	lea	16(rp), rp
   204  	paddq	%mm0, %mm6
   205  	movd	-4(up), %mm0
   206  	pmuludq	%mm7, %mm0
   207  	movd	%mm6, -8(rp)
   208  	psrlq	$32, %mm6
   209  	paddq	%mm0, %mm6
   210  	movd	(up), %mm0
   211  	pmuludq	%mm7, %mm0
   212  	movd	%mm6, -4(rp)
   213  	psrlq	$32, %mm6
   214  L(of2):	paddq	%mm0, %mm6
   215  	movd	4(up), %mm0
   216  	pmuludq	%mm7, %mm0
   217  	movd	%mm6, (rp)
   218  	psrlq	$32, %mm6
   219  	paddq	%mm0, %mm6
   220  	add	$4, un
   221  	movd	%mm6, 4(rp)
   222  	lea	16(up), up
   223  	js	L(lm2)
   224  
   225  	psrlq	$32, %mm6
   226  	movd	%mm6, 8(rp)
   227  
   228  	inc	n
   229  C	jz	L(done)
   230    lea	-16(up), up
   231  C  lea	(rp), rp
   232  C	jmp	L(ol1)
   233  
   234  C ================================================================
   235  
   236  L(ol1):	lea	4(up,n,4), up
   237  	movd	(up), %mm7	C read next U invariant limb
   238  	lea	8(rp,n,4), rp
   239  	mov	n, un
   240  
   241  	movd	4(up), %mm1
   242  	pmuludq	%mm7, %mm1
   243  	sar	$2, un
   244  	movd	%mm1, %ebx
   245  	inc	un
   246  	jz	L(re1)
   247  
   248  	movd	8(up), %mm0
   249  	pmuludq	%mm7, %mm0
   250  	xor	%edx, %edx	C zero edx and CF
   251  	jmp	L(a1)
   252  
   253  L(la1):	adc	$0, %edx
   254  	add	%ebx, 12(rp)
   255  	movd	%mm0, %eax
   256  	pmuludq	%mm7, %mm1
   257  	lea	16(rp), rp
   258  	psrlq	$32, %mm0
   259  	adc	%edx, %eax
   260  	movd	%mm0, %edx
   261  	movd	%mm1, %ebx
   262  	movd	8(up), %mm0
   263  	pmuludq	%mm7, %mm0
   264  	adc	$0, %edx
   265  	add	%eax, (rp)
   266  L(a1):	psrlq	$32, %mm1
   267  	adc	%edx, %ebx
   268  	movd	%mm1, %edx
   269  	movd	%mm0, %eax
   270  	movd	12(up), %mm1
   271  	pmuludq	%mm7, %mm1
   272  	adc	$0, %edx
   273  	add	%ebx, 4(rp)
   274  	psrlq	$32, %mm0
   275  	adc	%edx, %eax
   276  	movd	%mm0, %edx
   277  	movd	%mm1, %ebx
   278  	lea	16(up), up
   279  	movd	(up), %mm0
   280  	adc	$0, %edx
   281  	add	%eax, 8(rp)
   282  	psrlq	$32, %mm1
   283  	adc	%edx, %ebx
   284  	movd	%mm1, %edx
   285  	pmuludq	%mm7, %mm0
   286  	inc	un
   287  	movd	4(up), %mm1
   288  	jnz	L(la1)
   289  
   290  	adc	un, %edx	C un is zero here
   291  	add	%ebx, 12(rp)
   292  	movd	%mm0, %eax
   293  	pmuludq	%mm7, %mm1
   294  	lea	16(rp), rp
   295  	psrlq	$32, %mm0
   296  	adc	%edx, %eax
   297  	movd	%mm0, %edx
   298  	movd	%mm1, %ebx
   299  	adc	un, %edx
   300  	add	%eax, (rp)
   301  	psrlq	$32, %mm1
   302  	adc	%edx, %ebx
   303  	movd	%mm1, %eax
   304  	adc	un, %eax
   305  	add	%ebx, 4(rp)
   306  	adc	un, %eax
   307  	mov	%eax, 8(rp)
   308  
   309  	inc	n
   310  
   311  C ================================================================
   312  
   313  L(ol0):	lea	(up,n,4), up
   314  	movd	4(up), %mm7	C read next U invariant limb
   315  	lea	4(rp,n,4), rp
   316  	mov	n, un
   317  
   318  	movd	8(up), %mm0
   319  	pmuludq	%mm7, %mm0
   320  	sar	$2, un
   321  	movd	12(up), %mm1
   322  	movd	%mm0, %eax
   323  	pmuludq	%mm7, %mm1
   324  	xor	%edx, %edx	C zero edx and CF
   325  	jmp	L(a0)
   326  
   327  L(la0):	adc	$0, %edx
   328  	add	%ebx, 12(rp)
   329  	movd	%mm0, %eax
   330  	pmuludq	%mm7, %mm1
   331  	lea	16(rp), rp
   332  	psrlq	$32, %mm0
   333  	adc	%edx, %eax
   334  	movd	%mm0, %edx
   335  	movd	%mm1, %ebx
   336  	movd	8(up), %mm0
   337  	pmuludq	%mm7, %mm0
   338  	adc	$0, %edx
   339  	add	%eax, (rp)
   340  	psrlq	$32, %mm1
   341  	adc	%edx, %ebx
   342  	movd	%mm1, %edx
   343  	movd	%mm0, %eax
   344  	movd	12(up), %mm1
   345  	pmuludq	%mm7, %mm1
   346  	adc	$0, %edx
   347  	add	%ebx, 4(rp)
   348  L(a0):	psrlq	$32, %mm0
   349  	adc	%edx, %eax
   350  	movd	%mm0, %edx
   351  	movd	%mm1, %ebx
   352  	lea	16(up), up
   353  	movd	(up), %mm0
   354  	adc	$0, %edx
   355  	add	%eax, 8(rp)
   356  	psrlq	$32, %mm1
   357  	adc	%edx, %ebx
   358  	movd	%mm1, %edx
   359  	pmuludq	%mm7, %mm0
   360  	inc	un
   361  	movd	4(up), %mm1
   362  	jnz	L(la0)
   363  
   364  	adc	un, %edx	C un is zero here
   365  	add	%ebx, 12(rp)
   366  	movd	%mm0, %eax
   367  	pmuludq	%mm7, %mm1
   368  	lea	16(rp), rp
   369  	psrlq	$32, %mm0
   370  	adc	%edx, %eax
   371  	movd	%mm0, %edx
   372  	movd	%mm1, %ebx
   373  	adc	un, %edx
   374  	add	%eax, (rp)
   375  	psrlq	$32, %mm1
   376  	adc	%edx, %ebx
   377  	movd	%mm1, %eax
   378  	adc	un, %eax
   379  	add	%ebx, 4(rp)
   380  	adc	un, %eax
   381  	mov	%eax, 8(rp)
   382  
   383  	inc	n
   384  
   385  C ================================================================
   386  
   387  L(ol3):	lea	12(up,n,4), up
   388  	movd	-8(up), %mm7	C read next U invariant limb
   389  	lea	(rp,n,4), rp	C put rp back
   390  	mov	n, un
   391  
   392  	movd	-4(up), %mm1
   393  	pmuludq	%mm7, %mm1
   394  	sar	$2, un
   395  	movd	%mm1, %ebx
   396  	movd	(up), %mm0
   397  	xor	%edx, %edx	C zero edx and CF
   398  	jmp	L(a3)
   399  
   400  L(la3):	adc	$0, %edx
   401  	add	%ebx, 12(rp)
   402  	movd	%mm0, %eax
   403  	pmuludq	%mm7, %mm1
   404  	lea	16(rp), rp
   405  	psrlq	$32, %mm0
   406  	adc	%edx, %eax
   407  	movd	%mm0, %edx
   408  	movd	%mm1, %ebx
   409  	movd	8(up), %mm0
   410  	pmuludq	%mm7, %mm0
   411  	adc	$0, %edx
   412  	add	%eax, (rp)
   413  	psrlq	$32, %mm1
   414  	adc	%edx, %ebx
   415  	movd	%mm1, %edx
   416  	movd	%mm0, %eax
   417  	movd	12(up), %mm1
   418  	pmuludq	%mm7, %mm1
   419  	adc	$0, %edx
   420  	add	%ebx, 4(rp)
   421  	psrlq	$32, %mm0
   422  	adc	%edx, %eax
   423  	movd	%mm0, %edx
   424  	movd	%mm1, %ebx
   425  	lea	16(up), up
   426  	movd	(up), %mm0
   427  	adc	$0, %edx
   428  	add	%eax, 8(rp)
   429  L(a3):	psrlq	$32, %mm1
   430  	adc	%edx, %ebx
   431  	movd	%mm1, %edx
   432  	pmuludq	%mm7, %mm0
   433  	inc	un
   434  	movd	4(up), %mm1
   435  	jnz	L(la3)
   436  
   437  	adc	un, %edx	C un is zero here
   438  	add	%ebx, 12(rp)
   439  	movd	%mm0, %eax
   440  	pmuludq	%mm7, %mm1
   441  	lea	16(rp), rp
   442  	psrlq	$32, %mm0
   443  	adc	%edx, %eax
   444  	movd	%mm0, %edx
   445  	movd	%mm1, %ebx
   446  	adc	un, %edx
   447  	add	%eax, (rp)
   448  	psrlq	$32, %mm1
   449  	adc	%edx, %ebx
   450  	movd	%mm1, %eax
   451  	adc	un, %eax
   452  	add	%ebx, 4(rp)
   453  	adc	un, %eax
   454  	mov	%eax, 8(rp)
   455  
   456  	inc	n
   457  
   458  C ================================================================
   459  
   460  L(ol2):	lea	8(up,n,4), up
   461  	movd	-4(up), %mm7	C read next U invariant limb
   462  	lea	12(rp,n,4), rp
   463  	mov	n, un
   464  
   465  	movd	(up), %mm0
   466  	pmuludq	%mm7, %mm0
   467  	xor	%edx, %edx
   468  	sar	$2, un
   469  	movd	4(up), %mm1
   470  	test	un, un		C clear carry
   471  	movd	%mm0, %eax
   472  	pmuludq	%mm7, %mm1
   473  	inc	un
   474  	jnz	L(a2)
   475  	jmp	L(re2)
   476  
   477  L(la2):	adc	$0, %edx
   478  	add	%ebx, 12(rp)
   479  	movd	%mm0, %eax
   480  	pmuludq	%mm7, %mm1
   481  	lea	16(rp), rp
   482  L(a2):	psrlq	$32, %mm0
   483  	adc	%edx, %eax
   484  	movd	%mm0, %edx
   485  	movd	%mm1, %ebx
   486  	movd	8(up), %mm0
   487  	pmuludq	%mm7, %mm0
   488  	adc	$0, %edx
   489  	add	%eax, (rp)
   490  	psrlq	$32, %mm1
   491  	adc	%edx, %ebx
   492  	movd	%mm1, %edx
   493  	movd	%mm0, %eax
   494  	movd	12(up), %mm1
   495  	pmuludq	%mm7, %mm1
   496  	adc	$0, %edx
   497  	add	%ebx, 4(rp)
   498  	psrlq	$32, %mm0
   499  	adc	%edx, %eax
   500  	movd	%mm0, %edx
   501  	movd	%mm1, %ebx
   502  	lea	16(up), up
   503  	movd	(up), %mm0
   504  	adc	$0, %edx
   505  	add	%eax, 8(rp)
   506  	psrlq	$32, %mm1
   507  	adc	%edx, %ebx
   508  	movd	%mm1, %edx
   509  	pmuludq	%mm7, %mm0
   510  	inc	un
   511  	movd	4(up), %mm1
   512  	jnz	L(la2)
   513  
   514  	adc	un, %edx	C un is zero here
   515  	add	%ebx, 12(rp)
   516  	movd	%mm0, %eax
   517  	pmuludq	%mm7, %mm1
   518  	lea	16(rp), rp
   519  	psrlq	$32, %mm0
   520  	adc	%edx, %eax
   521  	movd	%mm0, %edx
   522  	movd	%mm1, %ebx
   523  	adc	un, %edx
   524  	add	%eax, (rp)
   525  	psrlq	$32, %mm1
   526  	adc	%edx, %ebx
   527  	movd	%mm1, %eax
   528  	adc	un, %eax
   529  	add	%ebx, 4(rp)
   530  	adc	un, %eax
   531  	mov	%eax, 8(rp)
   532  
   533  	inc	n
   534  	jmp	L(ol1)
   535  
   536  C ================================================================
   537  L(re2):	psrlq	$32, %mm0
   538  	movd	(up), %mm7	C read next U invariant limb
   539  	adc	%edx, %eax
   540  	movd	%mm0, %edx
   541  	movd	%mm1, %ebx
   542  	adc	un, %edx
   543  	add	%eax, (rp)
   544  	lea	4(rp), rp
   545  	psrlq	$32, %mm1
   546  	adc	%edx, %ebx
   547  	movd	%mm1, %eax
   548  	movd	4(up), %mm1
   549  	adc	un, %eax
   550  	add	%ebx, (rp)
   551  	pmuludq	%mm7, %mm1
   552  	adc	un, %eax
   553  	mov	%eax, 4(rp)
   554  	movd	%mm1, %ebx
   555  
   556  L(re1):	psrlq	$32, %mm1
   557  	add	%ebx, 4(rp)
   558  	movd	%mm1, %eax
   559  	adc	un, %eax
   560  	xor	n, n		C make n zeroness assumption below true
   561  	mov	%eax, 8(rp)
   562  
   563  L(done):			C n is zero here
   564  	mov	24(%esp), up
   565  	mov	28(%esp), %eax
   566  
   567  	movd	(up), %mm0
   568  	inc	%eax
   569  	pmuludq	%mm0, %mm0
   570  	lea	4(up), up
   571  	mov	20(%esp), rp
   572  	shr	%eax
   573  	movd	%mm0, (rp)
   574  	psrlq	$32, %mm0
   575  	lea	-12(rp), rp
   576  	mov	%eax, 28(%esp)
   577  	jnc	L(odd)
   578  
   579  	movd	%mm0, %ebp
   580  	movd	(up), %mm0
   581  	lea	8(rp), rp
   582  	pmuludq	%mm0, %mm0
   583  	lea	-4(up), up
   584  	add	8(rp), %ebp
   585  	movd	%mm0, %edx
   586  	adc	12(rp), %edx
   587  	rcr	n
   588  	jmp	L(ent)
   589  
   590  C	ALIGN(16)		C alignment seems irrelevant
   591  L(top):	movd	(up), %mm1
   592  	adc	n, n
   593  	movd	%mm0, %eax
   594  	pmuludq	%mm1, %mm1
   595  	movd	4(up), %mm0
   596  	adc	(rp), %eax
   597  	movd	%mm1, %ebx
   598  	pmuludq	%mm0, %mm0
   599  	psrlq	$32, %mm1
   600  	adc	4(rp), %ebx
   601  	movd	%mm1, %ebp
   602  	movd	%mm0, %edx
   603  	adc	8(rp), %ebp
   604  	adc	12(rp), %edx
   605  	rcr	n		C FIXME: isn't this awfully slow on atom???
   606  	adc	%eax, (rp)
   607  	adc	%ebx, 4(rp)
   608  L(ent):	lea	8(up), up
   609  	adc	%ebp, 8(rp)
   610  	psrlq	$32, %mm0
   611  	adc	%edx, 12(rp)
   612  L(odd):	decl	28(%esp)
   613  	lea	16(rp), rp
   614  	jnz	L(top)
   615  
   616  L(end):	adc	n, n
   617  	movd	%mm0, %eax
   618  	adc	n, %eax
   619  	mov	%eax, (rp)
   620  
   621  L(rtn):	emms
   622  	pop	%ebp
   623  	pop	%ebx
   624  	pop	%esi
   625  	pop	%edi
   626  	ret
   627  
   628  L(one):	pmuludq	%mm7, %mm7
   629  	movq	%mm7, -4(rp)
   630  	emms
   631  	pop	%esi
   632  	pop	%edi
   633  	ret
   634  EPILOGUE()