github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/redc_1.asm

dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C AMD bull	 ?
C AMD pile	 ?
C AMD steam	 ?
C AMD bobcat	 ?
C AMD jaguar	 ?
C Intel P4	 ?
C Intel core	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel IBR	 ?
C Intel HWL	 ?
C Intel BWL	 ?
C Intel atom	 ?
C VIA nano	 ?

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Micro-optimise, none performed thus far.
C  * This looks different from other current redc_1.asm variants.  Consider
C    adapting this to the mainstream style.
C  * Is this code really faster than other approaches which compute q0 later?
C    Is the use of a jump table faster?  Or is its edge due to the inlined
C    add_n code?
C  * Put initial m[0] x q0 computation in header.
C  * Put basecases at the file's end, single them out before the pushes.

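C The parameter registers below follow the SysV (STD64) calling convention;
C the trailing comments give each argument's DOS64 (Windows x64) location.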
define(`rp',          `%rdi')   C rcx
define(`up',          `%rsi')   C rdx
define(`mp_param',    `%rdx')   C r8
define(`n',           `%rcx')   C r9
define(`u0inv',       `%r8')    C stack

define(`i',           `%r11')
define(`nneg',        `%r12')
define(`mp',          `%r13')
define(`q0',          `%rbp')
define(`vp',          `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

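C Added summary (the authoritative contract lives in GMP's gmp-impl.h):
C mpn_redc_1(rp, up, mp, n, u0inv) performs Montgomery reduction (REDC),
C computing {rp,n} == {up,2n} * B^-n mod {mp,n} with B = 2^64, given
C u0inv = -1/mp[0] mod B.  The return value is a carry which the caller
C must handle, typically by a conditional subtraction of the modulus.
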
ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbp
	mov	(up), q0		C up[0]
	push	%rbx
	imul	u0inv, q0		C first q0, for all execution paths
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	n, nneg
	neg	nneg
	lea	(mp_param,n,8), mp	C mp += n
	lea	-16(up,n,8), up		C up += n

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	lea	4(%rax), %r9
	cmp	$4, R32(n)
	cmovg	%r9, %rax
	lea	L(tab)(%rip), %r9
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')
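
C Dispatch on n: %rax = n mod 4, biased by 4 when n > 4.  Table entries 0-3
C serve n <= 4 (n = 1..3 go to the basecases; n = 4 lands on L(0), which
C aliases the L(0m4) loop); entries 4-7 serve n > 4 via n mod 4.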

	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
	TEXT

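C Basecases: n = 1, 2 and 3 are computed with fully unrolled code.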
	ALIGN(16)
L(1):	mov	(mp_param), %rax
	mul	q0
	add	8(up), %rax
	adc	16(up), %rdx
	mov	%rdx, (rp)
	mov	$0, R32(%rax)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


	ALIGN(16)
L(2):	mov	(mp_param), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r9
	mul	q0
	add	(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	8(up), %r9
	adc	$0, %r14
	mov	%r9, q0
	imul	u0inv, q0
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	-8(mp), %rax
	mov	%rdx, %r11
	mul	q0
	add	%r9, %r10
	adc	%rax, %r11
	adc	%rdx, %rbx
	add	16(up), %r11
	adc	$0, %rbx
	xor	R32(%rax), R32(%rax)
	add	%r11, %r14
	adc	24(up), %rbx
	mov	%r14, (rp)
	mov	%rbx, 8(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


L(3):	mov	(mp_param), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	-8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	(up), %r10
	mov	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 8(up)
	adc	$0, %r14
	mov	%r14, -8(up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	8(up), %r10
	mov	%r10, 8(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	mov	%r10, q0
	imul	u0inv, q0
	add	%r9, 16(up)
	adc	$0, %r14
	mov	%r14, (up)

	mov	-24(mp), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	-16(mp), %rax
	mul	q0
	xor	R32(%r9), R32(%r9)
	xor	R32(%r14), R32(%r14)
	add	8(up), %rbx
	adc	%rax, %r10
	mov	-8(mp), %rax
	adc	%rdx, %r9
	mul	q0
	add	16(up), %r10
	adc	%rax, %r9
	adc	%rdx, %r14
	add	24(up), %r9
	adc	$0, %r14

	xor	R32(%rax), R32(%rax)
	add	-8(up), %r10
	adc	(up), %r9
	adc	32(up), %r14
	mov	%r10, (rp)
	mov	%r9, 8(rp)
	mov	%r14, 16(rp)
	adc	R32(%rax), R32(%rax)
	jmp	L(ret)


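C Unrolled code for n >= 4.  Each L(?m4) variant runs the outer loop n times;
C one pass adds q0 * {mp,n} into the current window of up while the next q0
C is computed on the fly in %r15, and the pass's high carry limb is saved for
C the final addition code.  The inner loops are four copies of the same 4-way
C unrolled addmul core, each entered at the point matching n mod 4.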
	ALIGN(16)
L(2m4):
L(lo2):	mov	(mp,nneg,8), %rax
	mul	q0
	xor	R32(%r14), R32(%r14)
	xor	R32(%rbx), R32(%rbx)
	mov	%rax, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r9
	mul	q0
	add	16(up,nneg,8), %r10
	adc	%rax, %r9
	mov	16(mp,nneg,8), %rax
	adc	%rdx, %r14
	mul	q0
	mov	$0, R32(%r10)		C xor?
	lea	2(nneg), i
	add	%r9, %r15
	imul	u0inv, %r15
	jmp	 L(e2)

	ALIGN(16)
L(li2):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
L(e2):	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li2)

L(le2):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo2)

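C n mod 4 = 2 outer code done: reload n as the negative loop counter for the
C final add, pre-add the two leftover limbs, then fall into the shared
C inlined add_n code, which folds the carry limbs saved at {vp} into {up},
C storing the sum at rp.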
	mov	nneg, n
	sar	$2, n
	lea	32(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-16(up), %r8
	mov	-8(up), %r9
	add	-16(vp), %r8
	adc	-8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	lea	16(rp), rp
	jmp	L(addx)


	ALIGN(16)
L(1m4):
L(lo1):	mov	(mp,nneg,8), %rax
	xor	%r9, %r9
	xor	R32(%rbx), R32(%rbx)
	mul	q0
	mov	%rax, %r9
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mov	%rdx, %r14
	mov	$0, R32(%r10)		C xor?
	mul	q0
	add	16(up,nneg,8), %r9
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	16(mp,nneg,8), %rax
	mul	q0
	lea	1(nneg), i
	add	%r14, %r15
	imul	u0inv, %r15
	jmp	 L(e1)

	ALIGN(16)
L(li1):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
L(e1):	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li1)

L(le1):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo1)

	mov	nneg, n
	sar	$2, n
	lea	24(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-8(up), %r8
	add	-8(vp), %r8
	mov	%r8, (rp)
	lea	8(rp), rp
	jmp	L(addx)


	ALIGN(16)
L(0):
L(0m4):
L(lo0):	mov	(mp,nneg,8), %rax
	mov	nneg, i
	mul	q0
	xor	R32(%r10), R32(%r10)
	mov	%rax, %r14
	mov	%rdx, %rbx
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %r14
	adc	%rax, %rbx
	adc	%rdx, %r10
	add	%rbx, %r15
	imul	u0inv, %r15
	jmp	L(e0)

	ALIGN(16)
L(li0):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(e0):	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li0)

L(le0):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	add	$8, up
	mov	%r15, q0
	dec	n
	jnz	L(lo0)

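C n mod 4 = 0: no leftover limbs to pre-add, so clear the carry flag and
C enter the add loop directly at L(addy).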
	mov	nneg, n
	sar	$2, n
	clc
	lea	16(up,nneg,8), up
	lea	(up,nneg,8), vp
	jmp	L(addy)


	ALIGN(16)
L(3m4):
L(lo3):	mov	(mp,nneg,8), %rax
	mul	q0
	mov	%rax, %rbx
	mov	%rdx, %r10
	mov	8(mp,nneg,8), %rax
	mov	24(up,nneg,8), %r15
	mul	q0
	add	16(up,nneg,8), %rbx	C result is zero, might carry
	mov	$0, R32(%rbx)		C zero
	mov	%rbx, %r14		C zero
	adc	%rax, %r10
	mov	16(mp,nneg,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	add	%r10, %r15
	mul	q0
	lea	3(nneg), i
	imul	u0inv, %r15
C	jmp	L(li3)

	ALIGN(16)
L(li3):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	R32(%r10), R32(%r10)
	mul	q0
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	q0
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	q0
	add	%rbx, 24(up,i,8)
	mov	$0, R32(%r14)		C zero
	mov	%r14, %rbx		C zero
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%r14, %r9		C zero
	adc	%rdx, %r9
	mul	q0
	add	$4, i
	js	 L(li3)

L(le3):	add	%r10, (up)
	adc	%rax, %r9
	adc	%r14, %rdx
	add	%r9, 8(up)
	adc	$0, %rdx
	mov	%rdx, 16(up,nneg,8)	C up[0]
	mov	%r15, q0
	lea	8(up), up
	dec	n
	jnz	L(lo3)


C ==== Addition code ====
	mov	nneg, n
	sar	$2, n
	lea	40(up,nneg,8), up
	lea	(up,nneg,8), vp

	mov	-24(up), %r8
	mov	-16(up), %r9
	mov	-8(up), %r10
	add	-24(vp), %r8
	adc	-16(vp), %r9
	adc	-8(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	lea	24(rp), rp

L(addx):inc	n
	jz	L(ad3)

L(addy):mov	(up), %r8
	mov	8(up), %r9
	inc	n
	jmp	L(mid)

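C 4-way unrolled carry-preserving add loop (the inlined add_n mentioned in
C the TODO): each pass folds four limbs of {vp} into four limbs of {up} and
C stores them at rp; inc and lea are used so the carry flag survives across
C iterations.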
C	ALIGN(16)
L(al3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	inc	n
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(al3)

L(ae3):	adc	(vp), %r8
	adc	8(vp), %r9
	adc	16(vp), %r10
	adc	24(vp), %r11
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

L(ad3):	mov	R32(n), R32(%rax)	C zero
	adc	R32(%rax), R32(%rax)

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()