github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bobcat/mul_basecase.asm (about)

     1  dnl  AMD64 mpn_mul_basecase optimised for AMD bobcat.
     2  
     3  dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C AMD K8,K9	 4.5
    35  C AMD K10	 4.5
    36  C AMD bd1	 4.75
    37  C AMD bobcat	 5
    38  C Intel P4	17.7
    39  C Intel core2	 5.5
    40  C Intel NHM	 5.43
    41  C Intel SBR	 3.92
    42  C Intel atom	23
    43  C VIA nano	 5.63
    44  
    45  C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
    46  C multiply insn bandwidth, without any apparent loop branch exit pipeline
    47  C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
    48  C the same way for all n, then it splits into 4 different wind-down blocks and
    49  C 4 separate addmul_1 loops.
    50  C
    51  C We have not tried using the same addmul_1 loops with a switch into feed-in
    52  C code, as we do in other basecase implementations.  Doing that could save
    53  C substantial code volume, but would also probably add some overhead.
    54  
    55  C TODO
    56  C  * Tune un < 3 code.
    57  C  * Fix slowdown for un=vn=3 (67->71) compared to default code.
    58  C  * This is 1263 bytes, compared to 1099 bytes for default code.  Consider
    59  C    combining addmul loops like that code.  Tolerable slowdown?
    60  C  * Lots of space could be saved by replacing the "switch" code by gradual
    61  C    jumps out from mul_1 winddown code, perhaps with no added overhead.
    62  C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.
    63  
    64  ABI_SUPPORT(DOS64)
    65  ABI_SUPPORT(STD64)
    66  
    67  C Standard parameters
    68  define(`rp',              `%rdi')	C result pointer
    69  define(`up',              `%rsi')	C multiplicand pointer
    70  define(`un_param',        `%rdx')	C multiplicand size in limbs
    71  define(`vp',              `%rcx')	C multiplier pointer
    72  define(`vn',              `%r8')	C multiplier size in limbs
    73  C Standard allocations
    74  define(`un',              `%rbx')	C holds -un_param, later biased per wind-down
    75  define(`w0',              `%r10')	C w0..w3: rotating product/carry pipeline
    76  define(`w1',              `%r11')
    77  define(`w2',              `%r12')
    78  define(`w3',              `%r13')
    79  define(`n',               `%rbp')	C negative index counter, stepped by 4
    80  define(`v0',              `%r9')	C current vp[] limb
    81  
    82  C Temp macro for allowing control over indexing.
    83  C Define to return $1 for more conservative ptr handling.
C As defined, X(generic,folded) expands to the second argument: the form
C with the known-constant n already folded into a fixed displacement.
    84  define(`X',`$2')
    85  
    86  
    87  ASM_START()
    88  	TEXT
    89  	ALIGN(16)
C mpn_mul_basecase(rp, up, un_param, vp, vn)
C Schoolbook product: writes the un_param+vn limb result of
C {up,un_param} x {vp,vn} at rp.  Assumes un_param >= vn >= 1 per the
C usual GMP mpn convention (NOTE(review): convention assumed from the
C vn==1 shortcut paths below -- confirm against callers).
    90  PROLOGUE(mpn_mul_basecase)
    91  	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')	C Win64: 5th arg (vn) is on the stack
    92  
    93  
    94  	mov	(up), %rax		C rax = up[0]
    95  	mov	(vp), v0		C v0  = vp[0]
    96  
    97  	cmp	$2, un_param
    98  	ja	L(ge3)			C un >= 3: general path
    99  	jz	L(u2)			C un == 2
   100  
C un == 1 (and hence vn == 1): a single 1x1 limb multiply.
   101  	mul	v0			C u0 x v0
   102  	mov	%rax, (rp)
   103  	mov	%rdx, 8(rp)
   104  	FUNC_EXIT()
   105  	ret
   106  
C un == 2: compute the v0 row {up,2} x v0; low limb goes straight to rp,
C the two high limbs are kept in w0/w1 pending the vn test.
   107  L(u2):	mul	v0			C u0 x v0
   108  	mov	%rax, (rp)
   109  	mov	8(up), %rax
   110  	mov	%rdx, w0
   111  	mul	v0			C u1 x v0
   112  	add	%rax, w0
   113  	mov	%rdx, w1
   114  	adc	$0, w1
   115  	cmp	$1, R32(vn)
   116  	jnz	L(u2v2)
   117  	mov	w0, 8(rp)		C vn == 1: store row and return
   118  	mov	w1, 16(rp)
   119  	FUNC_EXIT()
   120  	ret
   121  
C un == 2, vn == 2: add the v1 row.  vn (%r8) is not read again past the
C cmp above, so %r8 is recycled as a carry temporary.
   122  L(u2v2):mov	8(vp), v0		C v0 = vp[1]
   123  	mov	(up), %rax
   124  	mul	v0			C u0 x v1
   125  	add	%rax, w0
   126  	mov	w0, 8(rp)
   127  	mov	%rdx, %r8		C CAUTION: r8 realloc
   128  	adc	$0, %r8
   129  	mov	8(up), %rax
   130  	mul	v0			C u1 x v1
   131  	add	w1, %r8
   132  	adc	$0, %rdx
   133  	add	%r8, %rax
   134  	adc	$0, %rdx
   135  	mov	%rax, 16(rp)
   136  	mov	%rdx, 24(rp)
   137  	FUNC_EXIT()
   138  	ret
   139  
   140  
C un >= 3: compute the v0 row with a 4-way unrolled mul_1 loop, then
C dispatch on the final value of n into one of four addmul_1 wind-down
C variants.  L(rK) serves un_param == K (mod 4).
   141  L(ge3):	push	%rbx			C save callee-saved registers
   142  	push	%rbp			C (un, n, w2, w3 live in them)
   143  	push	%r12
   144  	push	%r13
   145  
   146  	lea	8(vp), vp		C vp points at vp[1] for the outer loops
   147  
   148  	lea	-24(rp,un_param,8), rp	C bias rp so (rp,n,8) indexing works
   149  	lea	-24(up,un_param,8), up	C likewise for up
   150  	xor	R32(un), R32(un)
   151  	mov	$2, R32(n)
   152  	sub	un_param, un		C un = -un_param
   153  	sub	un_param, n		C n  = 2 - un_param  (negative, un >= 3)
   154  
C Feed-in: first product of the v0 row (rax still holds up[0]).
   155  	mul	v0
   156  	mov	%rax, w2
   157  	mov	%rdx, w3
   158  	jmp	L(L3)
   159  
   160  	ALIGN(16)
C mul_1 loop: rp[] = up[] x v0, four limbs per iteration; n climbs by 4
C toward zero, so the final n is (2 - un_param) mod 4, in 0..3.
   161  L(top):	mov	w0, -16(rp,n,8)
   162  	add	w1, w2
   163  	adc	$0, w3
   164  	mov	(up,n,8), %rax
   165  	mul	v0
   166  	mov	%rax, w0
   167  	mov	%rdx, w1
   168  	mov	w2, -8(rp,n,8)
   169  	add	w3, w0
   170  	adc	$0, w1
   171  	mov	8(up,n,8), %rax
   172  	mul	v0
   173  	mov	%rax, w2
   174  	mov	%rdx, w3
   175  	mov	w0, (rp,n,8)
   176  	add	w1, w2
   177  	adc	$0, w3
   178  L(L3):	mov	16(up,n,8), %rax
   179  	mul	v0
   180  	mov	%rax, w0
   181  	mov	%rdx, w1
   182  	mov	w2, 8(rp,n,8)
   183  	add	w3, w0
   184  	adc	$0, w1
   185  	mov	24(up,n,8), %rax
   186  	mul	v0
   187  	mov	%rax, w2
   188  	mov	%rdx, w3
   189  	add	$4, n
   190  	js	L(top)
   191  
   192  	mov	w0, -16(rp,n,8)
   193  	add	w1, w2
   194  	adc	$0, w3
   195  
C Switch on n into right addmul_1 loop
   196  	test	n, n
   197  	jz	L(r2)			C n == 0
   198  	cmp	$2, R32(n)
   199  	ja	L(r3)			C n == 3
   200  	jz	L(r0)			C n == 2
   201  	jmp	L(r1)			C n == 1
   202  
   203  
   204  
C Wind-down and addmul_1 variant for un_param == 3 (mod 4); here n == 3,
C so the X() second operands are the pre-folded fixed displacements.
   205  L(r3):	mov	w2, X(-8(rp,n,8),16(rp))
   206  	mov	w3, X((rp,n,8),24(rp))
   207  	add	$2, un			C bias un for this variant's feed-in
   208  
C outer loop(3): one pass per remaining vp[] limb
   209  L(to3):	dec	vn
   210  	jz	L(ret)			C all multiplier limbs done
   211  	mov	(vp), v0		C next v limb
   212  	mov	8(up,un,8), %rax
   213  	lea	8(vp), vp
   214  	lea	8(rp), rp		C result row starts one limb higher
   215  	mov	un, n
   216  	mul	v0
   217  	mov	%rax, w2
   218  	mov	%rdx, w3
   219  	jmp	L(al3)
   220  
   221  	ALIGN(16)
C addmul_1 loop: rp[] += up[] x v0, four limbs per iteration
   222  L(ta3):	add	w0, -16(rp,n,8)
   223  	adc	w1, w2
   224  	adc	$0, w3
   225  	mov	(up,n,8), %rax
   226  	mul	v0
   227  	mov	%rax, w0
   228  	mov	%rdx, w1
   229  	add	w2, -8(rp,n,8)
   230  	adc	w3, w0
   231  	adc	$0, w1
   232  	mov	8(up,n,8), %rax
   233  	mul	v0
   234  	mov	%rax, w2
   235  	mov	%rdx, w3
   236  	add	w0, (rp,n,8)
   237  	adc	w1, w2
   238  	adc	$0, w3
   239  L(al3):	mov	16(up,n,8), %rax
   240  	mul	v0
   241  	mov	%rax, w0
   242  	mov	%rdx, w1
   243  	add	w2, 8(rp,n,8)
   244  	adc	w3, w0
   245  	adc	$0, w1
   246  	mov	24(up,n,8), %rax
   247  	mul	v0
   248  	mov	%rax, w2
   249  	mov	%rdx, w3
   250  	add	$4, n
   251  	js	L(ta3)
   252  
C Propagate the final carries and store the row's top limb.
   253  	add	w0, X(-16(rp,n,8),8(rp))
   254  	adc	w1, w2
   255  	adc	$0, w3
   256  	add	w2, X(-8(rp,n,8),16(rp))
   257  	adc	$0, w3
   258  	mov	w3, X((rp,n,8),24(rp))
   259  	jmp	L(to3)
   260  
   261  
   262  
C Wind-down and addmul_1 variant for un_param == 2 (mod 4); here n == 0,
C so the mul_1 row still has three products left to finish.
   263  L(r2):	mov	X(0(up,n,8),(up)), %rax
   264  	mul	v0
   265  	mov	%rax, w0
   266  	mov	%rdx, w1
   267  	mov	w2, X(-8(rp,n,8),-8(rp))
   268  	add	w3, w0
   269  	adc	$0, w1
   270  	mov	X(8(up,n,8),8(up)), %rax
   271  	mul	v0
   272  	mov	%rax, w2
   273  	mov	%rdx, w3
   274  	mov	w0, X((rp,n,8),(rp))
   275  	add	w1, w2
   276  	adc	$0, w3
   277  	mov	X(16(up,n,8),16(up)), %rax
   278  	mul	v0
   279  	mov	%rax, w0
   280  	mov	%rdx, w1
   281  	mov	w2, X(8(rp,n,8),8(rp))
   282  	add	w3, w0
   283  	adc	$0, w1
   284  	mov	w0, X(16(rp,n,8),16(rp))
   285  	adc	$0, w3
   286  	mov	w1, X(24(rp,n,8),24(rp))
   287  	inc	un			C bias un for this variant's feed-in
   288  
C outer loop(2): one pass per remaining vp[] limb
   289  L(to2):	dec	vn
   290  	jz	L(ret)			C all multiplier limbs done
   291  	mov	(vp), v0		C next v limb
   292  	mov	16(up,un,8), %rax
   293  	lea	8(vp), vp
   294  	lea	8(rp), rp		C result row starts one limb higher
   295  	mov	un, n
   296  	mul	v0
   297  	mov	%rax, w0
   298  	mov	%rdx, w1
   299  	jmp	L(al2)
   300  
   301  	ALIGN(16)
C addmul_1 loop: rp[] += up[] x v0, four limbs per iteration
   302  L(ta2):	add	w0, -16(rp,n,8)
   303  	adc	w1, w2
   304  	adc	$0, w3
   305  	mov	(up,n,8), %rax
   306  	mul	v0
   307  	mov	%rax, w0
   308  	mov	%rdx, w1
   309  	add	w2, -8(rp,n,8)
   310  	adc	w3, w0
   311  	adc	$0, w1
   312  	mov	8(up,n,8), %rax
   313  	mul	v0
   314  	mov	%rax, w2
   315  	mov	%rdx, w3
   316  	add	w0, (rp,n,8)
   317  	adc	w1, w2
   318  	adc	$0, w3
   319  	mov	16(up,n,8), %rax
   320  	mul	v0
   321  	mov	%rax, w0
   322  	mov	%rdx, w1
   323  	add	w2, 8(rp,n,8)
   324  	adc	w3, w0
   325  	adc	$0, w1
   326  L(al2):	mov	24(up,n,8), %rax
   327  	mul	v0
   328  	mov	%rax, w2
   329  	mov	%rdx, w3
   330  	add	$4, n
   331  	js	L(ta2)
   332  
C Propagate the final carries and store the row's top limb.
   333  	add	w0, X(-16(rp,n,8),8(rp))
   334  	adc	w1, w2
   335  	adc	$0, w3
   336  	add	w2, X(-8(rp,n,8),16(rp))
   337  	adc	$0, w3
   338  	mov	w3, X((rp,n,8),24(rp))
   339  	jmp	L(to2)
   340  
   341  
   342  
C Wind-down and addmul_1 variant for un_param == 1 (mod 4); here n == 1,
C so two products of the mul_1 row remain to be finished.
   343  L(r1):	mov	X(0(up,n,8),8(up)), %rax
   344  	mul	v0
   345  	mov	%rax, w0
   346  	mov	%rdx, w1
   347  	mov	w2, X(-8(rp,n,8),(rp))
   348  	add	w3, w0
   349  	adc	$0, w1
   350  	mov	X(8(up,n,8),16(up)), %rax
   351  	mul	v0
   352  	mov	%rax, w2
   353  	mov	%rdx, w3
   354  	mov	w0, X((rp,n,8),8(rp))
   355  	add	w1, w2
   356  	adc	$0, w3
   357  	mov	w2, X(8(rp,n,8),16(rp))
   358  	mov	w3, X(16(rp,n,8),24(rp))
   359  	add	$4, un			C bias un for this variant's feed-in
   360  
C outer loop(1): one pass per remaining vp[] limb
   361  L(to1):	dec	vn
   362  	jz	L(ret)			C all multiplier limbs done
   363  	mov	(vp), v0		C next v limb
   364  	mov	-8(up,un,8), %rax
   365  	lea	8(vp), vp
   366  	lea	8(rp), rp		C result row starts one limb higher
   367  	mov	un, n
   368  	mul	v0
   369  	mov	%rax, w2
   370  	mov	%rdx, w3
   371  	jmp	L(al1)
   372  
   373  	ALIGN(16)
C addmul_1 loop: rp[] += up[] x v0, four limbs per iteration
   374  L(ta1):	add	w0, -16(rp,n,8)
   375  	adc	w1, w2
   376  	adc	$0, w3
   377  L(al1):	mov	(up,n,8), %rax
   378  	mul	v0
   379  	mov	%rax, w0
   380  	mov	%rdx, w1
   381  	add	w2, -8(rp,n,8)
   382  	adc	w3, w0
   383  	adc	$0, w1
   384  	mov	8(up,n,8), %rax
   385  	mul	v0
   386  	mov	%rax, w2
   387  	mov	%rdx, w3
   388  	add	w0, (rp,n,8)
   389  	adc	w1, w2
   390  	adc	$0, w3
   391  	mov	16(up,n,8), %rax
   392  	mul	v0
   393  	mov	%rax, w0
   394  	mov	%rdx, w1
   395  	add	w2, 8(rp,n,8)
   396  	adc	w3, w0
   397  	adc	$0, w1
   398  	mov	24(up,n,8), %rax
   399  	mul	v0
   400  	mov	%rax, w2
   401  	mov	%rdx, w3
   402  	add	$4, n
   403  	js	L(ta1)
   404  
C Propagate the final carries and store the row's top limb.
   405  	add	w0, X(-16(rp,n,8),8(rp))
   406  	adc	w1, w2
   407  	adc	$0, w3
   408  	add	w2, X(-8(rp,n,8),16(rp))
   409  	adc	$0, w3
   410  	mov	w3, X((rp,n,8),24(rp))
   411  	jmp	L(to1)
   412  
   413  
   414  
C Wind-down and addmul_1 variant for un_param == 0 (mod 4); here n == 2,
C so only one product of the mul_1 row remains to be finished.
   415  L(r0):	mov	X((up,n,8),16(up)), %rax
   416  	mul	v0
   417  	mov	%rax, w0
   418  	mov	%rdx, w1
   419  	mov	w2, X(-8(rp,n,8),8(rp))
   420  	add	w3, w0
   421  	adc	$0, w1
   422  	mov	w0, X((rp,n,8),16(rp))
   423  	mov	w1, X(8(rp,n,8),24(rp))
   424  	add	$3, un			C bias un for this variant's feed-in
   425  
C outer loop(0): one pass per remaining vp[] limb
   426  L(to0):	dec	vn
   427  	jz	L(ret)			C all multiplier limbs done
   428  	mov	(vp), v0		C next v limb
   429  	mov	(up,un,8), %rax
   430  	lea	8(vp), vp
   431  	lea	8(rp), rp		C result row starts one limb higher
   432  	mov	un, n
   433  	mul	v0
   434  	mov	%rax, w0
   435  	mov	%rdx, w1
   436  	jmp	L(al0)
   437  
   438  	ALIGN(16)
C addmul_1 loop: rp[] += up[] x v0, four limbs per iteration
   439  L(ta0):	add	w0, -16(rp,n,8)
   440  	adc	w1, w2
   441  	adc	$0, w3
   442  	mov	(up,n,8), %rax
   443  	mul	v0
   444  	mov	%rax, w0
   445  	mov	%rdx, w1
   446  	add	w2, -8(rp,n,8)
   447  	adc	w3, w0
   448  	adc	$0, w1
   449  L(al0):	mov	8(up,n,8), %rax
   450  	mul	v0
   451  	mov	%rax, w2
   452  	mov	%rdx, w3
   453  	add	w0, (rp,n,8)
   454  	adc	w1, w2
   455  	adc	$0, w3
   456  	mov	16(up,n,8), %rax
   457  	mul	v0
   458  	mov	%rax, w0
   459  	mov	%rdx, w1
   460  	add	w2, 8(rp,n,8)
   461  	adc	w3, w0
   462  	adc	$0, w1
   463  	mov	24(up,n,8), %rax
   464  	mul	v0
   465  	mov	%rax, w2
   466  	mov	%rdx, w3
   467  	add	$4, n
   468  	js	L(ta0)
   469  
C Propagate the final carries and store the row's top limb.
   470  	add	w0, X(-16(rp,n,8),8(rp))
   471  	adc	w1, w2
   472  	adc	$0, w3
   473  	add	w2, X(-8(rp,n,8),16(rp))
   474  	adc	$0, w3
   475  	mov	w3, X((rp,n,8),24(rp))
   476  	jmp	L(to0)
   477  
   478  
   479  
C Common exit: restore the callee-saved registers pushed at L(ge3).
   480  L(ret):	pop	%r13
   481  	pop	%r12
   482  	pop	%rbp
   483  	pop	%rbx
   484  	FUNC_EXIT()
   485  	ret
   486  EPILOGUE()