github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bobcat/redc_1.asm (about)

     1  dnl  X86-64 mpn_redc_1 optimised for AMD bobcat.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 ?
    37  C AMD K10	 ?
    38  C AMD bull	 ?
    39  C AMD pile	 ?
    40  C AMD steam	 ?
    41  C AMD bobcat	 5.0
    42  C AMD jaguar	 ?
    43  C Intel P4	 ?
    44  C Intel core	 ?
    45  C Intel NHM	 ?
    46  C Intel SBR	 ?
    47  C Intel IBR	 ?
    48  C Intel HWL	 ?
    49  C Intel BWL	 ?
    50  C Intel atom	 ?
    51  C VIA nano	 ?
    52  
    53  C TODO
    54  C  * Micro-optimise, none performed thus far.
    55  C  * Consider inlining mpn_add_n.
    56  C  * Single out the basecases (n = 1, 2, 3) so they run before the pushes.
    57  
    58  C When playing with pointers, set this to $2 to fall back to conservative
    59  C indexing in wind-down code.
        C With $1 the wind-down adds use the plain form, e.g. -16(up); with $2 they
        C use the indexed form, e.g. -16(up,i,8).
    60  define(`I',`$1')
    61  
        C Incoming arguments.  The trailing register names are presumably where the
        C same argument arrives under DOS64; u0inv is fetched from the stack there.
    62  define(`rp',          `%rdi')   C rcx
    63  define(`up',          `%rsi')   C rdx
    64  define(`mp_param',    `%rdx')   C r8
    65  define(`n',           `%rcx')   C r9
    66  define(`u0inv',       `%r8')    C stack
    67  
        C Working registers: i is the inner-loop index, j the outer-loop counter,
        C mp the modulus pointer after it has been advanced by n limbs, q0 the
        C current quotient limb, and w0-w3 hold product and carry limbs in flight.
    68  define(`i',           `%r14')
    69  define(`j',           `%r15')
    70  define(`mp',          `%r12')
    71  define(`q0',          `%r13')
    72  define(`w0',          `%rbp')
    73  define(`w1',          `%r9')
    74  define(`w2',          `%r10')
    75  define(`w3',          `%r11')
    76  
    77  C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
    78  
    79  ABI_SUPPORT(DOS64)
    80  ABI_SUPPORT(STD64)
    81  
        C Alignment used in front of the inner-loop heads and L(n3).
    82  define(`ALIGNx', `ALIGN(16)')
    83  
    84  ASM_START()
    85  	TEXT
    86  	ALIGN(32)
        C mpn_redc_1 (rp, up, mp_param, n, u0inv)
        C
        C Outline, as far as this file shows: n passes are made over up.  In each
        C pass a quotient limb q0 is formed as the low 64 bits of (lowest limb of
        C the current window) * u0inv, and q0 times the n limbs at mp_param is added
        C into the window.  The lowest limb sum is never stored, only its carry is
        C used; each pass ends with one carry limb.  Finally the n carry limbs are
        C added to the n high limbs of up and the result is written to rp, using
        C inline code for n = 1, 2, 3 and mpn_add_n otherwise.
        C For n <= 3 the carry out of that final addition is left in rax; for
        C larger n rax is whatever mpn_add_n leaves there.
        C NOTE(review): n = 0 is not tested for; presumably callers pass n >= 1.
    87  PROLOGUE(mpn_redc_1)
    88  	FUNC_ENTRY(4)
        C DOS64 only: the fifth argument (u0inv) is loaded from the stack.
    89  IFDOS(`	mov	56(%rsp), %r8	')
        C Save the callee-saved registers this routine uses.
    90  	push	%rbx
    91  	push	%rbp
    92  	push	%r12
    93  	push	%r13
    94  	push	%r14
    95  	push	%r15
    96  
    97  	mov	(up), q0		C q0 = up[0]
    98  	mov	n, j			C outer loop induction var
        C Advance mp and up by n limbs and negate n, so that (mp,n,8) and
        C (up,n,8) address the first limb of the modulus and of the window.
    99  	lea	(mp_param,n,8), mp
   100  	lea	(up,n,8), up
   101  	neg	n
   102  	imul	u0inv, q0		C first iteration q0
   103  
        C Dispatch on the two low bits of the negated n.  Bit 0 is the same for
        C n and -n: even n goes to L(bx0), odd n falls through to L(bx1).
   104  	test	$1, R8(n)
   105  	jz	L(bx0)
   106  
   107  L(bx1):	test	$2, R8(n)		C odd n: bit 1 of -n clear means n = 3 (mod 4)
   108  	jz	L(b3)
   109    
        C n = 1 (mod 4).  n = 1 itself is handled by straight-line code at L(n1).
   110  L(b1):	cmp	$-1, R32(n)
   111  	jz	L(n1)
   112  
        C Outer loop, one pass per quotient limb.  i starts at n + 1 (n is negative
        C here), a multiple of 4 on this path, so it is exactly 0 when the inner
        C loop exits.
        C Feed-in: form the first three products of mp by q0, fold in the two
        C lowest limbs of the window, store the new second limb and derive the
        C next quotient limb from it in rbx.
   113  L(otp1):lea	1(n), i
   114  	mov	(mp,n,8), %rax
   115  	mul	q0
   116  	mov	%rax, w2
   117  	mov	%rdx, w3
   118  	mov	8(mp,n,8), %rax
   119  	mul	q0
   120  	mov	%rax, %rbx
   121  	mov	%rdx, w1
   122  	add	(up,n,8), w2		C only the carry is used; the sum is not stored
   123  	adc	w3, %rbx
   124  	adc	$0, w1
   125  	mov	16(mp,n,8), %rax
   126  	mul	q0
   127  	mov	%rax, w2
   128  	mov	%rdx, w3
   129  	add	8(up,n,8), %rbx
   130  	mov	%rbx, 8(up,n,8)		C mov leaves the carry from the add intact
   131  	adc	w1, w2
   132  	adc	$0, w3
   133  	imul	u0inv, %rbx		C next q limb
   134  	jmp	L(e1)
   135  
        C Inner loop, four limbs of mp per trip.  Each step adds the pending
        C product limb into up, propagates the carry into the next product and
        C issues the following multiply by q0.  L(e1) is the entry point for this
        C residue of n.  The loop runs while i is negative after the add of 4.
   136  	ALIGNx
   137  L(tp1):	add	w0, -16(up,i,8)
   138  	adc	w1, w2
   139  	adc	$0, w3
   140  	mov	(mp,i,8), %rax
   141  	mul	q0
   142  	mov	%rax, w0
   143  	mov	%rdx, w1
   144  	add	w2, -8(up,i,8)
   145  	adc	w3, w0
   146  	adc	$0, w1
   147  	mov	8(mp,i,8), %rax
   148  	mul	q0
   149  	mov	%rax, w2
   150  	mov	%rdx, w3
   151  	add	w0, (up,i,8)
   152  	adc	w1, w2
   153  	adc	$0, w3
   154  L(e1):	mov	16(mp,i,8), %rax
   155  	mul	q0
   156  	mov	%rax, w0
   157  	mov	%rdx, w1
   158  	add	w2, 8(up,i,8)
   159  	adc	w3, w0
   160  	adc	$0, w1
   161  	mov	24(mp,i,8), %rax
   162  	mul	q0
   163  	mov	%rax, w2
   164  	mov	%rdx, w3
   165  	add	$4, i
   166  	js	L(tp1)
   167  
        C Wind-down: add the last two pending limbs, then store the carry limb of
        C this pass in the lowest slot of the window (whose own sum was never
        C stored).  rbx becomes q0 for the next pass and the window moves up one
        C limb.  After the last pass control goes to the common tail L(cj).
   168  L(ed1):	add	w0, I(-16(up),-16(up,i,8))
   169  	adc	w1, w2
   170  	adc	$0, w3
   171  	add	w2, I(-8(up),-8(up,i,8))
   172  	adc	$0, w3
   173  	mov	w3, (up,n,8)		C up[0]
   174  	mov	%rbx, q0		C previously computed q limb -> q0
   175  	lea	8(up), up		C up++
   176  	dec	j
   177  	jnz	L(otp1)
   178  	jmp	L(cj)
   179  
   180  L(b3):	cmp	$-3, R32(n)		C n = 3 (mod 4); n = 3 itself goes to L(n3)
   181  	jz	L(n3)
   182  
        C Outer loop, one pass per quotient limb.  i starts at n + 3 (n is negative
        C here), a multiple of 4 on this path, so it is exactly 0 when the inner
        C loop exits.
        C Feed-in: form the first three products of mp by q0, fold in the two
        C lowest limbs of the window, store the new second limb and derive the
        C next quotient limb from it in rbx.
   183  L(otp3):lea	3(n), i
   184  	mov	(mp,n,8), %rax
   185  	mul	q0
   186  	mov	%rax, w2
   187  	mov	%rdx, w3
   188  	mov	8(mp,n,8), %rax
   189  	mul	q0
   190  	mov	%rax, %rbx
   191  	mov	%rdx, w1
   192  	add	(up,n,8), w2		C only the carry is used; the sum is not stored
   193  	adc	w3, %rbx
   194  	adc	$0, w1
   195  	mov	16(mp,n,8), %rax
   196  	mul	q0
   197  	mov	%rax, w2
   198  	mov	%rdx, w3
   199  	add	8(up,n,8), %rbx
   200  	mov	%rbx, 8(up,n,8)		C mov leaves the carry from the add intact
   201  	adc	w1, w2
   202  	adc	$0, w3
   203  	imul	u0inv, %rbx		C next q limb
   204  	jmp	L(e3)
   205  
        C Inner loop, four limbs of mp per trip.  Each step adds the pending
        C product limb into up, propagates the carry into the next product and
        C issues the following multiply by q0.  L(e3) is the entry point for this
        C residue of n.  The loop runs while i is negative after the add of 4.
   206  	ALIGNx
   207  L(tp3):	add	w0, -16(up,i,8)
   208  	adc	w1, w2
   209  	adc	$0, w3
   210  L(e3):	mov	(mp,i,8), %rax
   211  	mul	q0
   212  	mov	%rax, w0
   213  	mov	%rdx, w1
   214  	add	w2, -8(up,i,8)
   215  	adc	w3, w0
   216  	adc	$0, w1
   217  	mov	8(mp,i,8), %rax
   218  	mul	q0
   219  	mov	%rax, w2
   220  	mov	%rdx, w3
   221  	add	w0, (up,i,8)
   222  	adc	w1, w2
   223  	adc	$0, w3
   224  	mov	16(mp,i,8), %rax
   225  	mul	q0
   226  	mov	%rax, w0
   227  	mov	%rdx, w1
   228  	add	w2, 8(up,i,8)
   229  	adc	w3, w0
   230  	adc	$0, w1
   231  	mov	24(mp,i,8), %rax
   232  	mul	q0
   233  	mov	%rax, w2
   234  	mov	%rdx, w3
   235  	add	$4, i
   236  	js	L(tp3)
   237  
        C Wind-down: add the last two pending limbs, then store the carry limb of
        C this pass in the lowest slot of the window (whose own sum was never
        C stored).  rbx becomes q0 for the next pass and the window moves up one
        C limb.  After the last pass execution falls through to L(cj), which is
        C why the jump below is commented out.
   238  L(ed3):	add	w0, I(-16(up),-16(up,i,8))
   239  	adc	w1, w2
   240  	adc	$0, w3
   241  	add	w2, I(-8(up),-8(up,i,8))
   242  	adc	$0, w3
   243  	mov	w3, (up,n,8)		C up[0]
   244  	mov	%rbx, q0		C previously computed q limb -> q0
   245  	lea	8(up), up		C up++
   246  	dec	j
   247  	jnz	L(otp3)
   248  C	jmp	L(cj)
   249  
   250  L(cj):
        C Common tail of the four looping paths.  up has been advanced once per
        C pass, so (up,n,8) is now the start of the high n limbs and n limbs below
        C that are the carry limbs stored by the passes.  Set up the arguments
        C (rp, high limbs, carry limbs, n) for mpn_add_n in the registers of the
        C active ABI.
        C NOTE(review): neg on the 32-bit register clears the upper half, so the
        C positive count is recovered only when n fits in 32 bits.
   251  IFSTD(`	lea	(up,n,8), up		C param 2: up
   252  	lea	(up,n,8), %rdx		C param 3: up - n
   253  	neg	R32(n)		')	C param 4: n
   254  
   255  IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
   256  	lea	(%rdx,n,8), %r8		C param 3: up - n
   257  	neg	R32(n)
   258  	mov	n, %r9			C param 4: n
   259  	mov	rp, %rcx	')	C param 1: rp
   260  
        C Stack adjustment around the call: 8 bytes under STD64, 40 bytes under
        C DOS64 (presumably 32 bytes of shadow space plus 8 for alignment).
   261  IFSTD(`	sub	$8, %rsp	')
   262  IFDOS(`	sub	$40, %rsp	')
        C ASSERT is defined outside this file.  NOTE(review): check its definition
        C to confirm why nz is the expected outcome of this alignment test.
   263  	ASSERT(nz, `test $15, %rsp')
   264  	CALL(	mpn_add_n)
   265  IFSTD(`	add	$8, %rsp	')
   266  IFDOS(`	add	$40, %rsp	')
   267  
        C Shared exit: restore the saved registers in reverse push order.  The
        C pops do not touch rax, so it still holds the value set by mpn_add_n or
        C by the basecases (FUNC_EXIT is defined outside this file).
   268  L(ret):	pop	%r15
   269  	pop	%r14
   270  	pop	%r13
   271  	pop	%r12
   272  	pop	%rbp
   273  	pop	%rbx
   274  	FUNC_EXIT()
   275  	ret
   276  
   277  L(bx0):	test	$2, R8(n)		C even n: bit 1 of -n set means n = 2 (mod 4)
   278  	jnz	L(b2)
   279  
        C n = 0 (mod 4).  There is no basecase test on this path.
        C Outer loop, one pass per quotient limb.  i starts at n (negative, a
        C multiple of 4 on this path), so it is exactly 0 when the inner loop
        C exits.
        C Feed-in: form the first three products of mp by q0, fold in the two
        C lowest limbs of the window, store the new second limb and derive the
        C next quotient limb from it in rbx.
   280  L(b0):
   281  L(otp0):lea	(n), i
   282  	mov	(mp,n,8), %rax
   283  	mul	q0
   284  	mov	%rax, w0
   285  	mov	%rdx, w1
   286  	mov	8(mp,n,8), %rax
   287  	mul	q0
   288  	mov	%rax, %rbx
   289  	mov	%rdx, w3
   290  	add	(up,n,8), w0		C only the carry is used; the sum is not stored
   291  	adc	w1, %rbx
   292  	adc	$0, w3
   293  	mov	16(mp,n,8), %rax
   294  	mul	q0
   295  	mov	%rax, w0
   296  	mov	%rdx, w1
   297  	add	8(up,n,8), %rbx
   298  	mov	%rbx, 8(up,n,8)		C mov leaves the carry from the add intact
   299  	adc	w3, w0
   300  	adc	$0, w1
   301  	imul	u0inv, %rbx		C next q limb
   302  	jmp	L(e0)
   303  
        C Inner loop, four limbs of mp per trip.  Each step adds the pending
        C product limb into up, propagates the carry into the next product and
        C issues the following multiply by q0.  L(e0) is the entry point for this
        C residue of n.  The loop runs while i is negative after the add of 4.
   304  	ALIGNx
   305  L(tp0):	add	w0, -16(up,i,8)
   306  	adc	w1, w2
   307  	adc	$0, w3
   308  	mov	(mp,i,8), %rax
   309  	mul	q0
   310  	mov	%rax, w0
   311  	mov	%rdx, w1
   312  	add	w2, -8(up,i,8)
   313  	adc	w3, w0
   314  	adc	$0, w1
   315  	mov	8(mp,i,8), %rax
   316  	mul	q0
   317  	mov	%rax, w2
   318  	mov	%rdx, w3
   319  	add	w0, (up,i,8)
   320  	adc	w1, w2
   321  	adc	$0, w3
   322  	mov	16(mp,i,8), %rax
   323  	mul	q0
   324  	mov	%rax, w0
   325  	mov	%rdx, w1
   326  	add	w2, 8(up,i,8)
   327  	adc	w3, w0
   328  	adc	$0, w1
   329  L(e0):	mov	24(mp,i,8), %rax
   330  	mul	q0
   331  	mov	%rax, w2
   332  	mov	%rdx, w3
   333  	add	$4, i
   334  	js	L(tp0)
   335  
        C Wind-down: add the last two pending limbs, then store the carry limb of
        C this pass in the lowest slot of the window (whose own sum was never
        C stored).  rbx becomes q0 for the next pass and the window moves up one
        C limb.  After the last pass control goes to the common tail L(cj).
   336  L(ed0):	add	w0, I(-16(up),-16(up,i,8))
   337  	adc	w1, w2
   338  	adc	$0, w3
   339  	add	w2, I(-8(up),-8(up,i,8))
   340  	adc	$0, w3
   341  	mov	w3, (up,n,8)		C up[0]
   342  	mov	%rbx, q0		C previously computed q limb -> q0
   343  	lea	8(up), up		C up++
   344  	dec	j
   345  	jnz	L(otp0)
   346  	jmp	L(cj)
   347  
   348  L(b2):	cmp	$-2, R32(n)		C n = 2 (mod 4); n = 2 itself goes to L(n2)
   349  	jz	L(n2)
   350  
        C Outer loop, one pass per quotient limb.  i starts at n + 2 (n is negative
        C here), a multiple of 4 on this path, so it is exactly 0 when the inner
        C loop exits.
        C Feed-in: form the first three products of mp by q0, fold in the two
        C lowest limbs of the window, store the new second limb and derive the
        C next quotient limb from it in rbx.
   351  L(otp2):lea	2(n), i
   352  	mov	(mp,n,8), %rax
   353  	mul	q0
   354  	mov	%rax, w0
   355  	mov	%rdx, w1
   356  	mov	8(mp,n,8), %rax
   357  	mul	q0
   358  	mov	%rax, %rbx
   359  	mov	%rdx, w3
   360  	add	(up,n,8), w0		C only the carry is used; the sum is not stored
   361  	adc	w1, %rbx
   362  	adc	$0, w3
   363  	mov	16(mp,n,8), %rax
   364  	mul	q0
   365  	mov	%rax, w0
   366  	mov	%rdx, w1
   367  	add	8(up,n,8), %rbx
   368  	mov	%rbx, 8(up,n,8)		C mov leaves the carry from the add intact
   369  	adc	w3, w0
   370  	adc	$0, w1
   371  	imul	u0inv, %rbx		C next q limb
   372  	jmp	L(e2)
   373  
        C Inner loop, four limbs of mp per trip.  Each step adds the pending
        C product limb into up, propagates the carry into the next product and
        C issues the following multiply by q0.  L(e2) is the entry point for this
        C residue of n.  The loop runs while i is negative after the add of 4.
   374  	ALIGNx
   375  L(tp2):	add	w0, -16(up,i,8)
   376  	adc	w1, w2
   377  	adc	$0, w3
   378  	mov	(mp,i,8), %rax
   379  	mul	q0
   380  	mov	%rax, w0
   381  	mov	%rdx, w1
   382  	add	w2, -8(up,i,8)
   383  	adc	w3, w0
   384  	adc	$0, w1
   385  L(e2):	mov	8(mp,i,8), %rax
   386  	mul	q0
   387  	mov	%rax, w2
   388  	mov	%rdx, w3
   389  	add	w0, (up,i,8)
   390  	adc	w1, w2
   391  	adc	$0, w3
   392  	mov	16(mp,i,8), %rax
   393  	mul	q0
   394  	mov	%rax, w0
   395  	mov	%rdx, w1
   396  	add	w2, 8(up,i,8)
   397  	adc	w3, w0
   398  	adc	$0, w1
   399  	mov	24(mp,i,8), %rax
   400  	mul	q0
   401  	mov	%rax, w2
   402  	mov	%rdx, w3
   403  	add	$4, i
   404  	js	L(tp2)
   405  
        C Wind-down: add the last two pending limbs, then store the carry limb of
        C this pass in the lowest slot of the window (whose own sum was never
        C stored).  rbx becomes q0 for the next pass and the window moves up one
        C limb.  After the last pass control goes to the common tail L(cj).
   406  L(ed2):	add	w0, I(-16(up),-16(up,i,8))
   407  	adc	w1, w2
   408  	adc	$0, w3
   409  	add	w2, I(-8(up),-8(up,i,8))
   410  	adc	$0, w3
   411  	mov	w3, (up,n,8)		C up[0]
   412  	mov	%rbx, q0		C previously computed q limb -> q0
   413  	lea	8(up), up		C up++
   414  	dec	j
   415  	jnz	L(otp2)
   416  	jmp	L(cj)
   417  
   418  L(n1):	mov	(mp_param), %rax
        C Basecase n = 1, reached from L(b1).  up was advanced by one limb at
        C entry, so -8(up) is the original up[0] and (up) the original up[1].
        C rdx:rax = mp[0] * q0.  The low sum is discarded, only its carry is kept.
   419  	mul	q0
   420  	add	-8(up), %rax
   421  	adc	(up), %rdx
   422  	mov	%rdx, (rp)
        C rax is zeroed with mov rather than xor because the carry flag from the
        C adc above must survive; the final adc turns that carry into 0 or 1.
   423  	mov	$0, R32(%rax)
   424  	adc	R32(%rax), R32(%rax)
   425  	jmp	L(ret)
   426  
   427  L(n2):	mov	(mp_param), %rax
        C Basecase n = 2, reached from L(b2); both passes are written out inline.
        C mp and up were advanced by two limbs at entry, so -16(mp), -8(mp) are
        C the two modulus limbs and -16(up) .. 8(up) the four limbs of up.
        C Pass 1: add q0 * mp to the two low limbs.  The lowest sum (rbp) is not
        C used again, r10 gets the new second limb and r11 the carry limb.
   428  	mov	-16(up), %rbp
   429  	mul	q0
   430  	add	%rax, %rbp
   431  	mov	%rdx, %r9
   432  	adc	$0, %r9
   433  	mov	-8(mp), %rax
   434  	mov	-8(up), %r10
   435  	mul	q0
   436  	add	%rax, %r10
   437  	mov	%rdx, %r11
   438  	adc	$0, %r11
   439  	add	%r9, %r10
   440  	adc	$0, %r11
   441  	mov	%r10, q0
   442  	imul	u0inv, q0		C next q0
        C Pass 2: same again one limb higher, using r14 and rdx for the two
        C result limbs.  r14 is free here since no loop index is needed.
   443  	mov	-16(mp), %rax
   444  	mul	q0
   445  	add	%rax, %r10
   446  	mov	%rdx, %r9
   447  	adc	$0, %r9
   448  	mov	-8(mp), %rax
   449  	mov	(up), %r14
   450  	mul	q0
   451  	add	%rax, %r14
   452  	adc	$0, %rdx
   453  	add	%r9, %r14
   454  	adc	$0, %rdx
        C Final addition: fold in the pass 1 carry limb and the top limb of up,
        C store the two result limbs and return the carry out in rax.  rax is
        C cleared before the add chain; the two stores do not change the flags.
   455  	xor	R32(%rax), R32(%rax)
   456  	add	%r11, %r14
   457  	adc	8(up), %rdx
   458  	mov	%r14, (rp)
   459  	mov	%rdx, 8(rp)
   460  	adc	R32(%rax), R32(%rax)
   461  	jmp	L(ret)
   462  
   463  	ALIGNx
        C Basecase n = 3, reached from L(b3).  This is a loop of three passes
        C counted by j.  mp and up were advanced by three limbs at entry, so
        C -24(mp) .. -8(mp) are the modulus limbs and -24(up) .. -8(up) the
        C current three-limb window.
        C Each pass adds q0 * mp to the window.  The lowest sum is discarded (only
        C its carry is used), rbp and r10 receive the two upper sums, r11 the
        C carry limb, which is stored in the lowest slot of the window.  The next
        C q0 is formed from rbp after the third multiply has consumed the old q0.
   464  L(n3):	mov	-24(mp), %rax
   465  	mov	-24(up), %r10
   466  	mul	q0
   467  	add	%rax, %r10
   468  	mov	-16(mp), %rax
   469  	mov	%rdx, %r11
   470  	adc	$0, %r11
   471  	mov	-16(up), %rbp
   472  	mul	q0
   473  	add	%rax, %rbp
   474  	mov	%rdx, %r9
   475  	adc	$0, %r9
   476  	mov	-8(mp), %rax
   477  	add	%r11, %rbp
   478  	mov	-8(up), %r10
   479  	adc	$0, %r9
   480  	mul	q0
   481  	mov	%rbp, q0
   482  	imul	u0inv, q0		C next q0
   483  	add	%rax, %r10
   484  	mov	%rdx, %r11
   485  	adc	$0, %r11
   486  	mov	%rbp, -16(up)
   487  	add	%r9, %r10
   488  	adc	$0, %r11
   489  	mov	%r10, -8(up)
   490  	mov	%r11, -24(up)		C up[0]
   491  	lea	8(up), up		C up++
   492  	dec	j
   493  	jnz	L(n3)
   494  
        C Final addition.  -48(up) and -40(up) hold the carry limbs of the first
        C two passes and r11 that of the third; rbp, r10 and -8(up) are the three
        C high limbs.  rax is cleared before the add chain, the stores leave the
        C flags alone, and the last adc leaves the carry out in rax.
   495  	mov	-48(up), %rdx
   496  	mov	-40(up), %rbx
   497  	xor	R32(%rax), R32(%rax)
   498  	add	%rbp, %rdx
   499  	adc	%r10, %rbx
   500  	adc	-8(up), %r11
   501  	mov	%rdx, (rp)
   502  	mov	%rbx, 8(rp)
   503  	mov	%r11, 16(rp)
   504  	adc	R32(%rax), R32(%rax)
   505  	jmp	L(ret)
   506  EPILOGUE()
   507  ASM_END()