github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bobcat/sqr_basecase.asm

dnl  AMD64 mpn_sqr_basecase optimised for AMD bobcat.

dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 4.5
C AMD K10	 4.5
C AMD bd1	 4.75
C AMD bobcat	 5
C Intel P4	17.7
C Intel core2	 5.5
C Intel NHM	 5.43
C Intel SBR	 3.92
C Intel atom	23
C VIA nano	 5.63

C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
C the same way for all n, then it splits into 4 different wind-down blocks and
C 4 separate addmul_1 loops.
C
C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations.  Doing that could save
C substantial code volume, but would also probably add some overhead.
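C
C An illustrative C-level sketch of that scheme (editor's note: not part of
C this file's build; mpn_mul_1 and mpn_addmul_1 are the GMP C primitives the
C loops below correspond to):
C
C   rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
C   for (i = 1; i < n - 1; i++)
C     rp[n + i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, n - 1 - i, up[i]);
C   /* sqr_diag_addlsh1 then forms {rp,2n} = 2*{cross products} + {up[i]^2} */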

C TODO
C  * Tune un < 4 code.
C  * Perhaps implement a larger final corner (it is now 2 x 1).
C  * Lots of space could be saved by replacing the "switch" code by gradual
C    jumps out from mul_1 wind-down code, perhaps with no added overhead.
C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C Standard parameters
define(`rp',              `%rdi')
define(`up',              `%rsi')
define(`un_param',        `%rdx')
C Standard allocations
define(`un',              `%rbx')
define(`w0',              `%r8')
define(`w1',              `%r9')
define(`w2',              `%r10')
define(`w3',              `%r11')
define(`n',               `%rbp')
define(`v0',              `%rcx')

C Temp macro for allowing control over indexing.
C Define to return $1 for more conservative ptr handling.
define(`X',`$2')
dnl define(`X',`$1')
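C For example, X(-8(rp,n,8),(rp)) expands to (rp) with the definition above,
C and to the indexed form -8(rp,n,8) with the conservative variant enabled.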


ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)

	mov	(up), %rax

	cmp	$2, R32(un_param)
	jae	L(ge2)

	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

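C Flags are still live from the cmp above (mov does not clobber them); with
C un_param >= 2 known, the jnz below means un_param > 2.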
L(ge2):	mov	(up), v0
	jnz	L(g2)

	mul	%rax
	mov	%rax, (rp)
	mov	8(up), %rax
	mov	%rdx, w0
	mul	v0
	add	%rax, w0
	mov	%rdx, w1
	adc	$0, w1
	mov	8(up), v0
	mov	(up), %rax
	mul	v0
	add	%rax, w0
	mov	w0, 8(rp)
	mov	%rdx, w0		C CAUTION: r8 realloc
	adc	$0, w0
	mov	8(up), %rax
	mul	v0
	add	w1, w0
	adc	$0, %rdx
	add	w0, %rax
	adc	$0, %rdx
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret

L(g2):	cmp	$3, R32(un_param)
	ja	L(g3)
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	mov	8(up), %rax
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	(up), v0
	mov	8(up), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	16(up), %rax
	mul	v0
	xor	R32(w2), R32(w2)
	add	%rax, w1
	adc	%rdx, w2

	mov	8(up), v0
	mov	16(up), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, w2
	adc	%rdx, w3
	add	w0, w0
	adc	w1, w1
	adc	w2, w2
	adc	w3, w3
	mov	$0, R32(v0)
	adc	v0, v0
	add	w0, 8(rp)
	adc	w1, 16(rp)
	adc	w2, 24(rp)
	adc	w3, 32(rp)
	adc	v0, 40(rp)
	FUNC_EXIT()
	ret

L(g3):	push	%rbx
	push	%rbp

	mov	8(up), %rax
	lea	-24(rp,un_param,8), rp
	lea	-24(up,un_param,8), up
	neg	un_param
	push	un_param		C for sqr_diag_addlsh1
	lea	(un_param), un
	lea	3(un_param), n

	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(L3)

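C 4-way unrolled mul_1 loop; the jmp above enters it mid-body at L(L3), with
C the first product already in flight.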
	ALIGN(16)
L(top):	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, -8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, (rp,n,8)
	add	w1, w2
	adc	$0, w3
L(L3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, 8(rp,n,8)
	add	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(top)

	mov	w0, -16(rp,n,8)
	add	w1, w2
	adc	$0, w3

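C Here 0 <= n <= 3; dispatch to the matching wind-down block
C (n = 1 falls through to L(r1)).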
	test	n, n
	jz	L(r2)
	cmp	$2, R32(n)
	ja	L(r3)
	jz	L(r0)


L(r1):	mov	X((up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),8(rp))
	add	w1, w2
	adc	$0, w3
	mov	w2, X(8(rp,n,8),16(rp))
	mov	w3, X(16(rp,n,8),24(rp))
	add	$5, un
	jmp	L(to0)

L(r2):	mov	X((up,n,8),(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),-8(rp))
	add	w3, w0
	adc	$0, w1
	mov	X(8(up,n,8),8(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	mov	w0, X((rp,n,8),(rp))
	add	w1, w2
	adc	$0, w3
	mov	X(16(up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X(16(rp,n,8),16(rp))
	adc	$0, w3
	mov	w1, X(24(rp,n,8),24(rp))
	add	$6, un
	jmp	L(to1)

L(r3):	mov	w2, X(-8(rp,n,8),16(rp))
	mov	w3, X((rp,n,8),24(rp))
	add	$3, un
	jmp	L(to2)

L(r0):	mov	X((up,n,8),16(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	w2, X(-8(rp,n,8),8(rp))
	add	w3, w0
	adc	$0, w1
	mov	w0, X((rp,n,8),16(rp))
	mov	w1, X(8(rp,n,8),24(rp))
	add	$4, un
C	jmp	L(to3)
C fall through into main loop


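C Outer addmul_1 phase: each of the four blocks below performs one pass with
C v0 set to the next up[] limb; control cycles L(outer) -> L(to2) -> L(to1)
C -> L(to0) and back, with the termination test in L(to2).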
L(outer):
	mov	un, n
	mov	(up,un,8), v0
	mov	8(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al3)

	ALIGN(16)
L(ta3):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al3):	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta3)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to2):	mov	un, n
	cmp	$-4, R32(un)
	jnc	L(end)
	add	$4, un
	mov	8(up,n,8), v0
	mov	16(up,n,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al2)

	ALIGN(16)
L(ta2):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al2):	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta2)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to1):	mov	un, n
	mov	-16(up,un,8), v0
	mov	-8(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	jmp	L(al1)

	ALIGN(16)
L(ta1):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
L(al1):	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta1)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))


L(to0):	mov	un, n
	mov	-8(up,un,8), v0
	mov	(up,un,8), %rax
	lea	8(rp), rp
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(al0)

	ALIGN(16)
L(ta0):	add	w0, -16(rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, -8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
L(al0):	mov	8(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, (rp,n,8)
	adc	w1, w2
	adc	$0, w3
	mov	16(up,n,8), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	add	w2, 8(rp,n,8)
	adc	w3, w0
	adc	$0, w1
	mov	24(up,n,8), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	$4, n
	js	L(ta0)

	add	w0, X(-16(rp,n,8),8(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(-8(rp,n,8),16(rp))
	adc	$0, w3
	mov	w3, X((rp,n,8),24(rp))
	jmp	L(outer)


L(end):	mov	X(8(up,un,8),(up)), v0
	mov	X(16(up,un,8),8(up)), %rax
	mul	v0
	mov	%rax, w0
	mov	%rdx, w1
	mov	X(24(up,un,8),16(up)), %rax
	mul	v0
	mov	%rax, w2
	mov	%rdx, w3
	add	w0, X(24(rp,un,8),16(rp))
	adc	w1, w2
	adc	$0, w3
	add	w2, X(32(rp,un,8),24(rp))
	adc	$0, w3
	mov	X(16(up,un,8),8(up)), v0
	mov	X(24(up,un,8),16(up)), %rax
	mul	v0
	add	%rax, w3
	mov	w3, X(40(rp,un,8),32(rp))
	adc	$0, %rdx
	mov	%rdx, X(48(rp,un,8),40(rp))


C sqr_diag_addlsh1
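C Final pass: double the accumulated cross products in rp and add in the
C diagonal squares, i.e. {rp,2un} = 2*{cross products} + {up[i]^2 terms}.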

	lea	16(up), up
	lea	40(rp), rp
	pop	n
	lea	2(n,n), n
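C The pop retrieves the count saved (already negated) at entry; the lea then
C forms n = 2 - 2*un, the negative index for the diagonal loop.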

	mov	(up,n,4), %rax
	mul	%rax
	xor	R32(w2), R32(w2)

	mov	8(rp,n,8), w0
	mov	%rax, (rp,n,8)
	jmp	L(lm)

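C Loop body: the adc x,x pairs shift the cross-product limbs left by one
C (doubling with the carry chained across limbs) while mul %rax supplies the
C next diagonal square to fold in.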
	ALIGN(8)
L(tsd):	add	%rbx, w0
	adc	%rax, w1
	mov	w0, -8(rp,n,8)
	mov	8(rp,n,8), w0
	mov	w1, (rp,n,8)
L(lm):	mov	16(rp,n,8), w1
	adc	w0, w0
	adc	w1, w1
	lea	(%rdx,w2), %rbx
	mov	8(up,n,4), %rax
	setc	R8(w2)
	mul	%rax
	add	$2, n
	js	L(tsd)

L(esd):	add	%rbx, w0
	adc	%rax, w1
	mov	w0, X(-8(rp,n,8),-8(rp))
	mov	w1, X((rp,n,8),(rp))
	adc	w2, %rdx
	mov	%rdx, X(8(rp,n,8),8(rp))

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()