github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/mul_basecase.asm

dnl  AMD64 mpn_mul_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.

dnl  Copyright 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C AMD K8,K9	 2.375
C AMD K10	 2.375
C Intel P4	15-16
C Intel core2	 4.45
C Intel corei	 4.35
C Intel atom	 ?
C VIA nano	 4.5

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Use fewer registers.  (how??? I can't see it -- david)
C  * Avoid some "mov $0,r" and instead use "xor r,r".
C  * Can the top of each L(addmul_outer_n) prologue be folded into the
C    mul_1/mul_2 prologues, saving a LEA (%rip)?  It would slow down the
C    case where vn = 1 or 2; is it worth it?
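
C For reference, the operation implemented here is the schoolbook product
C {rp, un+vn} = {up, un} * {vp, vn}, with un >= vn >= 1 and rp disjoint
C from both inputs.  A minimal C model of the same result built from the
C public mpn primitives (a sketch for orientation only, not part of this
C file's build; the ref_mul_basecase name is ours):
C
C	#include <gmp.h>
C
C	static void
C	ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
C			  const mp_limb_t *vp, mp_size_t vn)
C	{
C	  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);	/* first row writes rp */
C	  for (mp_size_t i = 1; i < vn; i++)		/* later rows accumulate */
C	    rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C	}
C
C The code below produces the same rows, but peels vp[0] (or vp[0] and
C vp[1]) with mul_1/mul_2 and then folds the remaining rows in pairs via
C a 4-way unrolled addmul_2 loop.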
C INPUT PARAMETERS
define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
define(`vp',       `%rcx')
define(`vn',       `%r8')

define(`v0', `%r12')
define(`v1', `%r9')

define(`w0', `%rbx')
define(`w1', `%r15')
define(`w2', `%rbp')
define(`w3', `%r10')

define(`n',          `%r11')
define(`outer_addr', `%r14')
define(`un',         `%r13')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	xor	R32(un), R32(un)
	mov	(up), %rax
	mov	(vp), v0

	sub	un_param, un		C rdx used by mul
	mov	un, n
	mov	R32(un_param), R32(w0)

	lea	(rp,un_param,8), rp
	lea	(up,un_param,8), up

	mul	v0

	test	$1, R8(vn)
	jz	L(mul_2)

C ===========================================================
C     mul_1 for vp[0] if vn is odd

L(mul_1):
	and	$3, R32(w0)
	jz	L(mul_1_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_1_prologue_1)
	jz	L(mul_1_prologue_2)

L(mul_1_prologue_3):
	add	$-1, n
	lea	L(addmul_outer_3)(%rip), outer_addr
	mov	%rax, w3
	mov	%rdx, w0
	jmp	L(mul_1_entry_3)

L(mul_1_prologue_0):
	mov	%rax, w2
	mov	%rdx, w3		C note: already w0 == 0
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_1_entry_0)

L(mul_1_prologue_1):
	cmp	$-1, un
	jne	2f
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	jmp	L(ret)
2:	add	$1, n
	lea	L(addmul_outer_1)(%rip), outer_addr
	mov	%rax, w1
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	mov	(up,n,8), %rax
	jmp	L(mul_1_entry_1)

L(mul_1_prologue_2):
	add	$-2, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	%rax, w0
	mov	%rdx, w1
	mov	24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	xor	R32(w3), R32(w3)
	jmp	L(mul_1_entry_2)


C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
L(mul_1_entry_1):
	xor	R32(w0), R32(w0)
	mul	v0
	mov	w1, -8(rp,n,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,n,8), %rax
	mul	v0
	mov	w2, (rp,n,8)
	add	%rax, w3
	adc	%rdx, w0
L(mul_1_entry_3):
	mov	16(up,n,8), %rax
	mul	v0
	mov	w3, 8(rp,n,8)
	xor	R32(w2), R32(w2)	C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,n,8), %rax
	mov	w2, w1			C zero
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, n
	js	L(mul_1_top)

	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	adc	%rdx, w2
	mov	w2, (rp)

	add	$-1, vn			C vn -= 1
	jz	L(ret)

	mov	8(vp), v0
	mov	16(vp), v1

	lea	8(vp), vp		C vp += 1
	lea	8(rp), rp		C rp += 1

	jmp	*outer_addr
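
C Both mul_1 above and mul_2 below choose their entry into the 4-way
C unrolled loops from un mod 4; the compare-and-branch chain after
C "and $3, R32(w0)" amounts to (a C sketch; mul_2 orders the branches
C slightly differently but the mapping is identical):
C
C	switch (un_param & 3)
C	  {
C	  case 0: goto prologue_0;	/* jz  */
C	  case 1: goto prologue_1;	/* jc: w0 < 2 and w0 != 0 */
C	  case 2: goto prologue_2;	/* jz after cmp $2 */
C	  case 3: goto prologue_3;	/* falls through */
C	  }
C
C Each prologue also loads outer_addr with the matching L(addmul_outer_*)
C address, so the "jmp *outer_addr" above re-enters the addmul_2 code at
C the correct phase without re-dispatching on every outer iteration.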
C ===========================================================
C     mul_2 for vp[0], vp[1] if vn is even

	ALIGN(16)
L(mul_2):
	mov	8(vp), v1

	and	$3, R32(w0)
	jz	L(mul_2_prologue_0)
	cmp	$2, R32(w0)
	jz	L(mul_2_prologue_2)
	jc	L(mul_2_prologue_1)

L(mul_2_prologue_3):
	lea	L(addmul_outer_3)(%rip), outer_addr
	add	$2, n
	mov	%rax, -16(rp,n,8)
	mov	%rdx, w2
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	mov	-16(up,n,8), %rax
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_prologue_0):
	add	$3, n
	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	-24(up,n,8), %rax
	lea	L(addmul_outer_0)(%rip), outer_addr
	jmp	L(mul_2_entry_0)

	ALIGN(16)
L(mul_2_prologue_1):
	mov	%rax, w3
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_2):
	add	$1, n
	lea	L(addmul_outer_2)(%rip), outer_addr
	mov	$0, R32(w0)
	mov	$0, R32(w1)
	mov	%rax, w2
	mov	-8(up,n,8), %rax
	mov	%rdx, w3
	jmp	L(mul_2_entry_2)

C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(mul_2_top):
	mov	-32(up,n,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,n,8), %rax
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
L(mul_2_entry_0):
	mul	v1
	add	%rax, w1
	mov	w0, -24(rp,n,8)
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,n,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(rp,n,8)
L(mul_2_entry_3):
	mul	v1
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	adc	R32(w1), R32(w0)	C adc $0, w0
L(mul_2_entry_2):
	mul	v1
	add	%rax, w3
	mov	w2, -8(rp,n,8)
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
L(mul_2_entry_1):
	add	$4, n
	mov	w3, -32(rp,n,8)
	js	L(mul_2_top)

	mov	-32(up,n,8), %rax	C FIXME: n is constant
	mul	v1
	add	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	mov	16(vp), v0
	mov	24(vp), v1

	lea	16(vp), vp		C vp += 2
	lea	16(rp), rp		C rp += 2

	jmp	*outer_addr


C ===========================================================
C     addmul_2 for remaining vp's

C in the following prologues, we reuse un to store the
C adjusted value of n that is reloaded on each iteration

L(addmul_outer_0):
	add	$3, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-24(up,un,8), %rax
	mul	v0
	mov	%rax, w0
	mov	-24(up,un,8), %rax
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(addmul_entry_0)

L(addmul_outer_1):
	mov	un, n
	mov	(up,un,8), %rax
	mul	v0
	mov	%rax, w3
	mov	(up,un,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	jmp	L(addmul_entry_1)

L(addmul_outer_2):
	add	$1, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-8(up,un,8), %rax
	mul	v0
	xor	R32(w0), R32(w0)
	mov	%rax, w2
	xor	R32(w1), R32(w1)
	mov	%rdx, w3
	mov	-8(up,un,8), %rax
	jmp	L(addmul_entry_2)

L(addmul_outer_3):
	add	$2, un
	lea	0(%rip), outer_addr

	mov	un, n
	mov	-16(up,un,8), %rax
	xor	R32(w3), R32(w3)
	mul	v0
	mov	%rax, w1
	mov	-16(up,un,8), %rax
	mov	%rdx, w2
	jmp	L(addmul_entry_3)
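
C Each pass through the loop below folds two more rows of the product into
C one pipeline.  Its effect matches two chained mpn_addmul_1 calls (a C
C sketch, valid because rp[n] and rp[n+1] are still unwritten at this
C point in the basecase; the ref_addmul_2 name is ours):
C
C	static void
C	ref_addmul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
C		      const mp_limb_t *vp)
C	{
C	  rp[n]     = mpn_addmul_1 (rp,     up, n, vp[0]);	/* v0 row */
C	  rp[n + 1] = mpn_addmul_1 (rp + 1, up, n, vp[1]);	/* v1 row, one limb up */
C	}
C
C Fusing the rows means each rp[] limb is read and written once per pair
C of rows instead of twice, with the running window held in w0..w3.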
C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments

	ALIGN(16)
L(addmul_top):
	add	w3, -32(rp,n,8)
	adc	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,n,8), %rax
	adc	%rdx, w1
	adc	R32(w2), R32(w2)	C adc $0, w2
L(addmul_entry_0):
	mul	v1
	xor	R32(w3), R32(w3)
	add	w0, -24(rp,n,8)
	adc	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	-16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(addmul_entry_3):
	mul	v1
	add	w1, -16(rp,n,8)
	adc	%rax, w2
	mov	-8(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	xor	R32(w0), R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	-8(up,n,8), %rax
	adc	R32(w1), R32(w0)	C adc $0, w0
L(addmul_entry_2):
	mul	v1
	add	w2, -8(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w3
	mov	(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(addmul_entry_1):
	mul	v1
	add	$4, n
	js	L(addmul_top)

	add	w3, -8(rp)
	adc	%rax, w0
	mov	w0, (rp)
	adc	%rdx, w1
	mov	w1, 8(rp)

	add	$-2, vn			C vn -= 2
	jz	L(ret)

	lea	16(rp), rp		C rp += 2
	lea	16(vp), vp		C vp += 2

	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

	ALIGN(16)
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

EPILOGUE()
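
C Usage note: mpn_mul_basecase is an internal GMP entry point, declared in
C gmp-impl.h rather than the public gmp.h, and is what mpn_mul falls back
C to for small operands.  A direct call, assuming the internal prototype
C void mpn_mul_basecase (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t),
C would look like:
C
C	mp_limb_t r[5 + 3], u[5] = {1, 2, 3, 4, 5}, v[3] = {6, 7, 8};
C	mpn_mul_basecase (r, u, 5, v, 3);	/* fills all 5+3 limbs of r */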