github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/mullo_basecase.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/mullo_basecase.asm (about)

     1  dnl  AMD64 mpn_mullo_basecase.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C The inner loops of this code are the result of running a code generation and
    36  C optimisation tool suite written by David Harvey and Torbjorn Granlund.
    37  
    38  C NOTES
    39  C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
    40  C     large trip count.  Instead, we should start with mul_2 for any operand
    41  C     size congruence class.
    42  C   * Stop iterating addmul_2 earlier, falling into straight-line triangle code
    43  C     for the last 2-3 iterations.
    44  C   * Perhaps implement n=4 special code.
    45  C   * The reload of the outer loop jump address hurts branch prediction.
    46  C   * The addmul_2 loop ends with an MUL whose high part is not used upon loop
    47  C     exit.
    48  
    49  C INPUT PARAMETERS
    50  define(`rp',	   `%rdi')		C destination limb pointer
    51  define(`up',	   `%rsi')		C first source operand pointer
    52  define(`vp_param', `%rdx')		C second source operand pointer as passed in
    53  define(`n',	   `%rcx')		C limb count
    54  
    55  define(`vp',	`%r11')			C copy of vp_param kept for the general code, mul overwrites rdx
    56  define(`outer_addr', `%r8')		C address jumped to for the next outer pass
    57  define(`j',	`%r9')			C inner loop index, negative and stepped by 4 towards zero
    58  define(`v0',	`%r13')			C current multiplier limb
    59  define(`v1',	`%r14')			C second multiplier limb for the two-limb passes
    60  define(`w0',	`%rbx')			C w0..w3 hold the partial product limbs in flight
    61  define(`w1',	`%r15')
    62  define(`w2',	`%rbp')
    63  define(`w3',	`%r10')
    64  
    65  ABI_SUPPORT(DOS64)
    66  ABI_SUPPORT(STD64)
    67  
    68  ASM_START()
    69  	TEXT
    70  	ALIGN(16)
    71  PROLOGUE(mpn_mullo_basecase)		C store the low n limbs of up[] x vp[] at rp[]
    72  	FUNC_ENTRY(4)
    73  	cmp	$4, n			C n >= 4 is handled by the general code
    74  	jge	L(gen)
    75  	mov	(up), %rax		C u0
    76  	mov	(vp_param), %r8		C v0
    77  
    78  	lea	L(tab)(%rip), %r9	C r9 = jump table base, dispatch on n < 4
    79  ifdef(`PIC',
    80  `	movslq	(%r9,%rcx,4), %r10
    81  	add	%r10, %r9
    82  	jmp	*%r9
    83  ',`
    84  	jmp	*(%r9,n,8)
    85  ')
    86  	JUMPTABSECT
    87  	ALIGN(8)
    88  L(tab):	JMPENT(	L(tab), L(tab))			C not allowed
    89  	JMPENT(	L(1), L(tab))			C 1
    90  	JMPENT(	L(2), L(tab))			C 2
    91  	JMPENT(	L(3), L(tab))			C 3
    92  dnl	JMPENT(	L(0m4), L(tab))			C 4
    93  dnl	JMPENT(	L(1m4), L(tab))			C 5
    94  dnl	JMPENT(	L(2m4), L(tab))			C 6
    95  dnl	JMPENT(	L(3m4), L(tab))			C 7
    96  dnl	JMPENT(	L(0m4), L(tab))			C 8
    97  dnl	JMPENT(	L(1m4), L(tab))			C 9
    98  dnl	JMPENT(	L(2m4), L(tab))			C 10
    99  dnl	JMPENT(	L(3m4), L(tab))			C 11
   100  	TEXT				C the code continues after the jump table
   101  
   102  L(1):	imul	%r8, %rax		C n = 1: low 64 bits of u0 x v0
   103  	mov	%rax, (rp)		C r0
   104  	FUNC_EXIT()
   105  	ret
   106  
   107  L(2):	mov	8(vp_param), %r11	C n = 2: fetch v1 before mul overwrites rdx
   108  	imul	%rax, %r11		C u0 x v1
   109  	mul	%r8			C u0 x v0
   110  	mov	%rax, (rp)		C r0
   111  	imul	8(up), %r8		C u1 x v0
   112  	lea	(%r11, %rdx), %rax	C low of u0 x v1 plus high of u0 x v0
   113  	add	%r8, %rax		C plus low of u1 x v0, carries out of r1 are dropped
   114  	mov	%rax, 8(rp)		C r1
   115  	FUNC_EXIT()
   116  	ret
   117  
   118  L(3):	mov	8(vp_param), %r9	C v1
   119  	mov	16(vp_param), %r11	C v2, fetched before mul overwrites rdx
   120  	mul	%r8			C u0 x v0 -> <r1,r0>
   121  	mov	%rax, (rp)		C r0
   122  	mov	(up), %rax		C u0
   123  	mov	%rdx, %rcx		C r1
   124  	mul	%r9			C u0 x v1 -> <r2,r1>
   125  	imul	8(up), %r9		C u1 x v1 -> r2
   126  	mov	16(up), %r10		C u2
   127  	imul	%r8, %r10		C u2 x v0 -> r2
   128  	add	%rax, %rcx		C r1 += low of u0 x v1
   129  	adc	%rdx, %r9		C r2 += high of u0 x v1 plus carry
   130  	add	%r10, %r9		C r2 += low of u2 x v0
   131  	mov	8(up), %rax		C u1
   132  	mul	%r8			C u1 x v0 -> <r2,r1>
   133  	add	%rax, %rcx		C r1 += low of u1 x v0
   134  	adc	%rdx, %r9		C r2 += high of u1 x v0 plus carry
   135  	mov	%r11, %rax		C v2
   136  	imul	(up), %rax		C u0 x v2 -> r2
   137  	add	%rax, %r9		C carries out of r2 are dropped
   138  	mov	%rcx, 8(rp)		C r1
   139  	mov	%r9, 16(rp)		C r2
   140  	FUNC_EXIT()
   141  	ret
   142  
   143  L(0m4):
   144  L(1m4):
   145  L(2m4):
   146  L(3m4):
   147  L(gen):	push	%rbx			C general case n >= 4: save the callee-saved registers used below
   148  	push	%rbp
   149  	push	%r13
   150  	push	%r14
   151  	push	%r15
   152  
   153  	mov	(up), %rax		C u0
   154  	mov	(vp_param), v0		C v0
   155  	mov	vp_param, vp		C keep the v pointer, mul overwrites rdx
   156  
   157  	lea	(rp,n,8), rp		C point rp n limbs past its start
   158  	lea	(up,n,8), up		C point up n limbs past its start
   159  	neg	n			C negative count, indexing runs up towards zero
   160  
   161  	mul	v0			C rdx:rax = u0 x v0
   162  
   163  	test	$1, R8(n)		C lea, neg and mul are done, only now test parity
   164  	jz	L(mul_2)		C even n starts with a two-limb pass, odd n falls into mul_1
   165  
   166  L(mul_1):				C first pass for odd n: multiply up[] by v0 only
   167  	lea	-8(rp), rp		C bias rp and up down by one limb for this pass
   168  	lea	-8(up), up
   169  	test	$2, R8(n)		C bit 1 of the negated count picks the loop entry point
   170  	jnz	L(mul_1_prologue_3)
   171  
   172  L(mul_1_prologue_2):		C n = 7, 11, 15, ...
   173  	lea	-1(n), j
   174  	lea	L(addmul_outer_1)(%rip), outer_addr	C where the pass after this one starts
   175  	mov	%rax, w0		C w1:w0 = u0 x v0 from the mul before the branch
   176  	mov	%rdx, w1
   177  	xor	R32(w2), R32(w2)
   178  	xor	R32(w3), R32(w3)
   179  	mov	16(up,n,8), %rax	C u1, given the bias applied to up above
   180  	jmp	L(mul_1_entry_2)
   181  
   182  L(mul_1_prologue_3):		C n = 5, 9, 13, ...
   183  	lea	1(n), j
   184  	lea	L(addmul_outer_3)(%rip), outer_addr	C where the pass after this one starts
   185  	mov	%rax, w2		C w3:w2 = u0 x v0 from the mul before the branch
   186  	mov	%rdx, w3
   187  	xor	R32(w0), R32(w0)
   188  	jmp	L(mul_1_entry_0)
   189  
   190  	ALIGN(16)
   191  L(mul_1_top):			C four limbs of up[] x v0 per iteration
   192  	mov	w0, -16(rp,j,8)		C store a finished result limb
   193  	add	%rax, w1
   194  	mov	(up,j,8), %rax
   195  	adc	%rdx, w2
   196  	xor	R32(w0), R32(w0)	C w0 = 0, reused for a later high part
   197  	mul	v0
   198  	mov	w1, -8(rp,j,8)
   199  	add	%rax, w2
   200  	adc	%rdx, w3
   201  L(mul_1_entry_0):
   202  	mov	8(up,j,8), %rax
   203  	mul	v0
   204  	mov	w2, (rp,j,8)
   205  	add	%rax, w3
   206  	adc	%rdx, w0
   207  	mov	16(up,j,8), %rax
   208  	mul	v0
   209  	mov	w3, 8(rp,j,8)
   210  	xor	R32(w2), R32(w2)	C zero
   211  	mov	w2, w3			C zero
   212  	add	%rax, w0
   213  	mov	24(up,j,8), %rax
   214  	mov	w2, w1			C zero
   215  	adc	%rdx, w1
   216  L(mul_1_entry_2):
   217  	mul	v0
   218  	add	$4, j
   219  	js	L(mul_1_top)		C loop while j is still negative
   220  
   221  	mov	w0, -16(rp)		C wind down: store the last complete limbs
   222  	add	%rax, w1
   223  	mov	w1, -8(rp)
   224  	adc	%rdx, w2
   225  
   226  	imul	(up), v0		C top limb needs only the low half of this product
   227  	add	v0, w2
   228  	mov	w2, (rp)
   229  
   230  	add	$1, n			C one multiplier limb has been consumed
   231  	jz	L(ret)
   232  
   233  	mov	8(vp), v0		C load the next two multiplier limbs
   234  	mov	16(vp), v1
   235  
   236  	lea	16(up), up
   237  	lea	8(vp), vp		C vp now points at the limb held in v0
   238  	lea	24(rp), rp
   239  
   240  	jmp	*outer_addr		C continue with the entry point chosen in the prologue
   241  
   242  
   243  L(mul_2):				C first pass for even n: multiply up[] by the limb pair v1,v0
   244  	mov	8(vp), v1
   245  	test	$2, R8(n)		C bit 1 of the negated count picks the loop entry point
   246  	jz	L(mul_2_prologue_3)
   247  
   248  	ALIGN(16)
   249  L(mul_2_prologue_1):		C n = 6, 10, 14, ...
   250  	lea	0(n), j
   251  	mov	%rax, w3		C w0:w3 = u0 x v0 from the mul before the branch
   252  	mov	%rdx, w0
   253  	xor	R32(w1), R32(w1)
   254  	mov	(up,n,8), %rax		C u0 again, up points n limbs past its start and n is negative
   255  	lea	L(addmul_outer_3)(%rip), outer_addr	C where the pass after this one starts
   256  	jmp	L(mul_2_entry_1)
   257  
   258  	ALIGN(16)
   259  L(mul_2_prologue_3):		C n = 4, 8, 12, ...
   260  	lea	2(n), j
   261  	mov	$0, R32(w3)
   262  	mov	%rax, w1		C w2:w1 = u0 x v0 from the mul before the branch
   263  	mov	(up,n,8), %rax		C u0 again, next multiplied by v1
   264  	mov	%rdx, w2
   265  	lea	L(addmul_outer_1)(%rip), outer_addr	C where the pass after this one starts
   266  	jmp	L(mul_2_entry_3)
   267  
   268  	ALIGN(16)
   269  L(mul_2_top):			C four result limbs per iteration, each up limb times v0 and v1
   270  	mov	-32(up,j,8), %rax
   271  	mul	v1
   272  	add	%rax, w0
   273  	adc	%rdx, w1
   274  	mov	-24(up,j,8), %rax
   275  	xor	R32(w2), R32(w2)
   276  	mul	v0
   277  	add	%rax, w0
   278  	mov	-24(up,j,8), %rax	C same up limb reloaded for the v1 product
   279  	adc	%rdx, w1
   280  	adc	$0, R32(w2)		C catch the carry in a fresh accumulator
   281  	mul	v1
   282  	add	%rax, w1
   283  	mov	w0, -24(rp,j,8)		C store a finished result limb
   284  	adc	%rdx, w2
   285  	mov	-16(up,j,8), %rax
   286  	mul	v0
   287  	mov	$0, R32(w3)		C mov leaves the flags alone, the adc chain continues
   288  	add	%rax, w1
   289  	adc	%rdx, w2
   290  	mov	-16(up,j,8), %rax
   291  	adc	$0, R32(w3)
   292  L(mul_2_entry_3):
   293  	mov	$0, R32(w0)
   294  	mov	w1, -16(rp,j,8)
   295  	mul	v1
   296  	add	%rax, w2
   297  	mov	-8(up,j,8), %rax
   298  	adc	%rdx, w3
   299  	mov	$0, R32(w1)
   300  	mul	v0
   301  	add	%rax, w2
   302  	mov	-8(up,j,8), %rax
   303  	adc	%rdx, w3
   304  	adc	R32(w1), R32(w0)	C w1 is zero here, so this adds just the carry
   305  	mul	v1
   306  	add	%rax, w3
   307  	mov	w2, -8(rp,j,8)
   308  	adc	%rdx, w0
   309  	mov	(up,j,8), %rax
   310  	mul	v0
   311  	add	%rax, w3
   312  	adc	%rdx, w0
   313  	adc	$0, R32(w1)
   314  L(mul_2_entry_1):
   315  	add	$4, j
   316  	mov	w3, -32(rp,j,8)		C mov keeps the sign flag from the add for the js below
   317  	js	L(mul_2_top)		C loop while j is still negative
   318  
   319  	imul	-16(up), v1		C top limb needs only the low halves of these products
   320  	add	v1, w0
   321  	imul	-8(up), v0
   322  	add	v0, w0
   323  	mov	w0, -8(rp)
   324  
   325  	add	$2, n			C two multiplier limbs have been consumed
   326  	jz	L(ret)
   327  
   328  	mov	16(vp), v0		C load the next two multiplier limbs
   329  	mov	24(vp), v1
   330  
   331  	lea	16(vp), vp		C vp now points at the limb held in v0
   332  	lea	16(rp), rp
   333  
   334  	jmp	*outer_addr		C continue with the entry point chosen in the prologue
   335  
   336  
   337  L(addmul_outer_1):			C outer pass entry: add up[] x <v1,v0> into rp[], entering at entry_1
   338  	lea	-2(n), j
   339  	mov	-16(up,n,8), %rax
   340  	mul	v0
   341  	mov	%rax, w3		C w0:w3 = first product of this pass
   342  	mov	-16(up,n,8), %rax	C same up limb reloaded for the v1 product
   343  	mov	%rdx, w0
   344  	xor	R32(w1), R32(w1)
   345  	lea	L(addmul_outer_3)(%rip), outer_addr	C the next pass uses the other entry point
   346  	jmp	L(addmul_entry_1)
   347  
   348  L(addmul_outer_3):			C outer pass entry: same work, entering at entry_3
   349  	lea	0(n), j
   350  	mov	-16(up,n,8), %rax
   351  	xor	R32(w3), R32(w3)
   352  	mul	v0
   353  	mov	%rax, w1		C w2:w1 = first product of this pass
   354  	mov	-16(up,n,8), %rax	C same up limb reloaded for the v1 product
   355  	mov	%rdx, w2
   356  	lea	L(addmul_outer_1)(%rip), outer_addr	C the next pass uses the other entry point
   357  	jmp	L(addmul_entry_3)
   358  
   359  	ALIGN(16)
   360  L(addmul_top):			C four result limbs per iteration, accumulated into rp[]
   361  	add	w3, -32(rp,j,8)		C add a finished limb into the result in memory
   362  	adc	%rax, w0
   363  	mov	-24(up,j,8), %rax
   364  	adc	%rdx, w1
   365  	xor	R32(w2), R32(w2)
   366  	mul	v0
   367  	add	%rax, w0
   368  	mov	-24(up,j,8), %rax
   369  	adc	%rdx, w1
   370  	adc	R32(w2), R32(w2)	C w2 was zeroed above, so this leaves just the carry
   371  	mul	v1
   372  	xor	R32(w3), R32(w3)
   373  	add	w0, -24(rp,j,8)
   374  	adc	%rax, w1
   375  	mov	-16(up,j,8), %rax
   376  	adc	%rdx, w2
   377  	mul	v0
   378  	add	%rax, w1
   379  	mov	-16(up,j,8), %rax
   380  	adc	%rdx, w2
   381  	adc	$0, R32(w3)
   382  L(addmul_entry_3):
   383  	mul	v1
   384  	add	w1, -16(rp,j,8)
   385  	adc	%rax, w2
   386  	mov	-8(up,j,8), %rax
   387  	adc	%rdx, w3
   388  	mul	v0
   389  	xor	R32(w0), R32(w0)
   390  	add	%rax, w2
   391  	adc	%rdx, w3
   392  	mov	$0, R32(w1)		C mov leaves the flags alone, the adc chain continues
   393  	mov	-8(up,j,8), %rax
   394  	adc	R32(w1), R32(w0)	C w1 is zero here, so this adds just the carry
   395  	mul	v1
   396  	add	w2, -8(rp,j,8)
   397  	adc	%rax, w3
   398  	adc	%rdx, w0
   399  	mov	(up,j,8), %rax
   400  	mul	v0
   401  	add	%rax, w3
   402  	mov	(up,j,8), %rax
   403  	adc	%rdx, w0
   404  	adc	$0, R32(w1)
   405  L(addmul_entry_1):
   406  	mul	v1
   407  	add	$4, j
   408  	js	L(addmul_top)		C loop while j is still negative
   409  
   410  	add	w3, -32(rp)		C wind down: fold in the last complete limb
   411  	adc	%rax, w0
   412  
   413  	imul	-24(up), v0		C top limb needs only the low half of this product
   414  	add	v0, w0
   415  	add	w0, -24(rp)
   416  
   417  	add	$2, n			C two more multiplier limbs have been consumed
   418  	jns	L(ret)			C finished once n is no longer negative
   419  
   420  	lea	16(vp), vp		C advance to the next pair of multiplier limbs
   421  
   422  	mov	(vp), v0
   423  	mov	8(vp), v1
   424  
   425  	lea	-16(up), up		C the next pass covers two fewer limbs of up[]
   426  
   427  	jmp	*outer_addr		C continue with the alternate entry point
   428  
   429  L(ret):	pop	%r15			C restore registers in reverse order of the pushes at L(gen)
   430  	pop	%r14
   431  	pop	%r13
   432  	pop	%rbp
   433  	pop	%rbx
   434  	FUNC_EXIT()
   435  	ret
   436  EPILOGUE()