github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/redc_1.asm (about)

     1  dnl  X86-64 mpn_redc_1 optimised for Intel Atom.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 ?
    37  C AMD K10	 ?
    38  C AMD bull	 ?
    39  C AMD pile	 ?
    40  C AMD steam	 ?
    41  C AMD bobcat	 5.0
    42  C AMD jaguar	 ?
    43  C Intel P4	 ?
    44  C Intel core	 ?
    45  C Intel NHM	 ?
    46  C Intel SBR	 ?
    47  C Intel IBR	 ?
    48  C Intel HWL	 ?
    49  C Intel BWL	 ?
    50  C Intel atom	 ?
    51  C VIA nano	 ?
    52  
    53  C TODO
    54  C  * Micro-optimise, none performed thus far.
    55  C  * Consider inlining mpn_add_n.
    56  C  * Single basecases out before the pushes.
    57  C  * Make lead-in code for the inner loops be more similar.
    58  
    59  C When playing with pointers, set this to $2 to fall back to conservative
    60  C indexing in wind-down code.
    61  define(`I',`$1')			C I picks its first argument, the non-indexed address form
    62  
        C Parameters are named by their STD64 register.  The trailing comment on
        C each line matches the Microsoft x64 argument order, so it is presumably
        C the DOS64 location of the same parameter.
    63  define(`rp',          `%rdi')   C rcx
    64  define(`up',          `%rsi')   C rdx
    65  define(`mp_param',    `%rdx')   C r8
    66  define(`n',           `%rcx')   C r9
    67  define(`u0inv',       `%r8')    C stack
    68  
    69  define(`i',           `%r14')	C inner loop limb index, negative, stepped by 4 until non-negative
    70  define(`j',           `%r15')	C outer loop counter, starts at n
    71  define(`mp',          `%r12')	C mp_param + 8*n, so negative indices address the mp limbs
    72  define(`q0',          `%r13')	C current q limb, up[0] * u0inv
    73  define(`w0',          `%rbp')	C w0-w3 are not referenced below, the code names
    74  define(`w1',          `%r9')	C   rbp, r9, r10 and r11 directly
    75  define(`w2',          `%r10')
    76  define(`w3',          `%r11')
    77  
    78  C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
    79  
    80  ABI_SUPPORT(DOS64)
    81  ABI_SUPPORT(STD64)
    82  
    83  define(`ALIGNx', `ALIGN(16)')		C alignment used before the inner loop tops and L(n3)
    84  
        C mpn_redc_1 (rp, up, mp_param, n, u0inv)
        C
        C Runs n outer passes over the limbs at up.  Each pass takes the current
        C q0 = up[0] * u0inv, adds q0 * {mp_param,n} to {up,n}, writes the
        C carry-out limb of that sum to up[0] and advances up by one limb.  The
        C low limb of the sum is never stored, only its carry is propagated.
        C NOTE(review): presumably u0inv is chosen so that this low limb is zero;
        C confirm against the caller.
        C
        C For n = 1, 2, 3 the final addition into rp is done inline and rax
        C returns its carry.  For all other n the code ends at L(cj), which calls
        C mpn_add_n and leaves that function's return value in rax.
    85  ASM_START()
    86  	TEXT
    87  	ALIGN(32)
    88  PROLOGUE(mpn_redc_1)
    89  	FUNC_ENTRY(4)
    90  IFDOS(`	mov	56(%rsp), %r8	')	C DOS64 only: fetch u0inv, the 5th argument, from the stack
    91  	push	%rbx			C save the callee-saved registers used below
    92  	push	%rbp
    93  	push	%r12
    94  	push	%r13
    95  	push	%r14
    96  	push	%r15
    97  
    98  	mov	(up), q0		C q0 = up[0]
    99  	mov	n, j			C outer loop induction var
   100  	lea	(mp_param,n,8), mp	C mp = mp_param + 8*n
   101  	lea	(up,n,8), up		C up += n limbs
   102  	neg	n			C n = -n, so (mp,n,8) and (up,n,8) address limb 0
   103  	imul	u0inv, q0		C first iteration q0
   104  
   105  	test	$1, R8(n)		C negation keeps bit 0, so this tests whether n is even
   106  	jz	L(bx0)
   107  
   108  L(bx1):	test	$2, R8(n)		C bit 1 of -n is clear when n = 3 mod 4
   109  	jz	L(b3)
   110  
   111  L(b1):	cmp	$-1, R32(n)		C n = 1 mod 4; n == 1 has its own basecase
   112  	jz	L(n1)
   113  
   114  L(otp1):lea	1(n), i			C outer pass for n = 1 mod 4, n >= 5; i = inner index
   115  	mov	(mp,n,8), %rax		C mp[0]
   116  	mul	q0
   117  	mov	%rax, %rbp
   118  	mov	8(mp,n,8), %rax		C mp[1]
   119  	mov	%rdx, %r9
   120  	mul	q0
   121  	mov	%rax, %rbx
   122  	mov	16(mp,n,8), %rax	C mp[2]
   123  	mov	%rdx, %r10
   124  	mul	q0
   125  	add	(up,n,8), %rbp		C only the carry is used, rbp is overwritten next
   126  	mov	%rax, %rbp
   127  	adc	%r9, %rbx
   128  	mov	24(mp,n,8), %rax	C mp[3]
   129  	adc	$0, %r10
   130  	mov	%rdx, %r9
   131  	mul	q0
   132  	add	8(up,n,8), %rbx
   133  	mov	%rbx, 8(up,n,8)		C store new up[1], which is up[0] of the next pass
   134  	mov	%rax, %r11
   135  	adc	%r10, %rbp
   136  	mov	32(mp,n,8), %rax	C mp[4]
   137  	adc	$0, %r9
   138  	imul	u0inv, %rbx		C next q limb
   139  	jmp	L(e1)			C enter the unrolled loop at its last stage
   140  
   141  	ALIGNx
   142  L(tp1):	mul	q0			C inner loop, four limbs per iteration
   143  	add	%rbp, -24(up,i,8)
   144  	mov	%rax, %rbp
   145  	mov	(mp,i,8), %rax
   146  	adc	%r9, %r11
   147  	mov	%rdx, %r9
   148  	adc	$0, %r10
   149  	mul	q0
   150  	add	%r11, -16(up,i,8)
   151  	mov	%rax, %r11
   152  	mov	8(mp,i,8), %rax
   153  	adc	%r10, %rbp
   154  	mov	%rdx, %r10
   155  	adc	$0, %r9
   156  	mul	q0
   157  	add	%rbp, -8(up,i,8)
   158  	mov	%rax, %rbp
   159  	adc	%r9, %r11
   160  	mov	16(mp,i,8), %rax
   161  	adc	$0, %r10
   162  	mov	%rdx, %r9
   163  	mul	q0
   164  	add	%r11, (up,i,8)
   165  	mov	%rax, %r11
   166  	adc	%r10, %rbp
   167  	mov	24(mp,i,8), %rax
   168  	adc	$0, %r9
   169  L(e1):	add	$4, i
   170  	mov	%rdx, %r10		C mov keeps the flags set by the add above
   171  	js	L(tp1)			C loop while i is still negative
   172  
   173  L(ed1):	mul	q0			C wind-down: last product of this pass
   174  	add	%rbp, I(-24(up),-24(up,i,8))
   175  	adc	%r9, %r11
   176  	adc	$0, %r10
   177  	add	%r11, I(-16(up),-16(up,i,8))
   178  	adc	%r10, %rax
   179  	adc	$0, %rdx
   180  	add	%rax, I(-8(up),-8(up,i,8))
   181  	adc	$0, %rdx		C rdx = carry-out limb of this pass
   182  	mov	%rdx, (up,n,8)		C up[0]
   183  	mov	%rbx, q0		C previously computed q limb -> q0
   184  	lea	8(up), up		C up++
   185  	dec	j
   186  	jnz	L(otp1)
   187  	jmp	L(cj)
   188  
   189  L(b3):	cmp	$-3, R32(n)		C n = 3 mod 4; n == 3 has its own basecase
   190  	jz	L(n3)
   191  
   192  L(otp3):lea	3(n), i			C outer pass for n = 3 mod 4, n >= 7; i = inner index
   193  	mov	(mp,n,8), %rax		C mp[0]
   194  	mul	q0
   195  	mov	%rax, %rbp
   196  	mov	8(mp,n,8), %rax		C mp[1]
   197  	mov	%rdx, %r9
   198  	mul	q0
   199  	mov	%rax, %rbx
   200  	mov	16(mp,n,8), %rax	C mp[2]
   201  	mov	%rdx, %r10
   202  	mul	q0
   203  	add	(up,n,8), %rbp		C only the carry is used, rbp is overwritten next
   204  	mov	%rax, %rbp
   205  	mov	24(mp,n,8), %rax	C mp[3]
   206  	adc	%r9, %rbx
   207  	mov	%rdx, %r9
   208  	adc	$0, %r10
   209  	mul	q0
   210  	add	8(up,n,8), %rbx
   211  	mov	%rbx, 8(up,n,8)		C store new up[1], which is up[0] of the next pass
   212  	mov	%rax, %r11
   213  	mov	32(mp,n,8), %rax	C mp[4]
   214  	adc	%r10, %rbp
   215  	mov	%rdx, %r10
   216  	adc	$0, %r9
   217  	imul	u0inv, %rbx		C next q limb
   218  	jmp	L(e3)			C enter the unrolled loop at its third stage
   219  
   220  	ALIGNx
   221  L(tp3):	mul	q0			C inner loop, four limbs per iteration
   222  	add	%rbp, -24(up,i,8)
   223  	mov	%rax, %rbp
   224  	mov	(mp,i,8), %rax
   225  	adc	%r9, %r11
   226  	mov	%rdx, %r9
   227  	adc	$0, %r10
   228  	mul	q0
   229  	add	%r11, -16(up,i,8)
   230  	mov	%rax, %r11
   231  	mov	8(mp,i,8), %rax
   232  	adc	%r10, %rbp
   233  	mov	%rdx, %r10
   234  	adc	$0, %r9
   235  L(e3):	mul	q0
   236  	add	%rbp, -8(up,i,8)
   237  	mov	%rax, %rbp
   238  	adc	%r9, %r11
   239  	mov	16(mp,i,8), %rax
   240  	adc	$0, %r10
   241  	mov	%rdx, %r9
   242  	mul	q0
   243  	add	%r11, (up,i,8)
   244  	mov	%rax, %r11
   245  	adc	%r10, %rbp
   246  	mov	24(mp,i,8), %rax
   247  	adc	$0, %r9
   248  	add	$4, i
   249  	mov	%rdx, %r10		C mov keeps the flags set by the add above
   250  	js	L(tp3)			C loop while i is still negative
   251  
   252  L(ed3):	mul	q0			C wind-down: last product of this pass
   253  	add	%rbp, I(-24(up),-24(up,i,8))
   254  	adc	%r9, %r11
   255  	adc	$0, %r10
   256  	add	%r11, I(-16(up),-16(up,i,8))
   257  	adc	%r10, %rax
   258  	adc	$0, %rdx
   259  	add	%rax, I(-8(up),-8(up,i,8))
   260  	adc	$0, %rdx		C rdx = carry-out limb of this pass
   261  	mov	%rdx, (up,n,8)		C up[0]
   262  	mov	%rbx, q0		C previously computed q limb -> q0
   263  	lea	8(up), up		C up++
   264  	dec	j
   265  	jnz	L(otp3)			C when done, fall through into L(cj)
   266  C	jmp	L(cj)
   267  
   268  L(cj):					C common join for every path that loops n times
        C On arrival up has been advanced by n limbs during the outer passes and
        C n is still negative.  The code below sets up and performs the call
        C mpn_add_n (rp, up, up - n, n) in the terms of the param comments.
        C Under STD64 rp is still in rdi, which this function never writes.
   269  IFSTD(`	lea	(up,n,8), up		C param 2: up
   270  	lea	(up,n,8), %rdx		C param 3: up - n
   271  	neg	R32(n)		')	C param 4: n
   272  
   273  IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
   274  	lea	(%rdx,n,8), %r8		C param 3: up - n
   275  	neg	R32(n)
   276  	mov	n, %r9			C param 4: n
   277  	mov	rp, %rcx	')	C param 1: rp
   278  
   279  IFSTD(`	sub	$8, %rsp	')	C 6 pushes + return address = 56 bytes, 8 more makes 64
   280  IFDOS(`	sub	$40, %rsp	')	C NOTE(review): presumably 32 bytes of shadow space plus 8 for alignment - confirm against FUNC_ENTRY
   281  	ASSERT(nz, `test $15, %rsp')	C NOTE(review): rsp looks 16-byte aligned here, so the test would give zero; confirm that nz is the intended condition
   282  	CALL(	mpn_add_n)		C rax is not written again below, so its result is returned
   283  IFSTD(`	add	$8, %rsp	')
   284  IFDOS(`	add	$40, %rsp	')
   285  
   286  L(ret):	pop	%r15			C shared exit: restore in reverse push order
   287  	pop	%r14
   288  	pop	%r13
   289  	pop	%r12
   290  	pop	%rbp
   291  	pop	%rbx
   292  	FUNC_EXIT()
   293  	ret
   294  
   295  L(bx0):	test	$2, R8(n)		C n is even; bit 1 of -n is set when n = 2 mod 4
   296  	jnz	L(b2)
   297  
   298  L(b0):	cmp	$-4, R32(n)		C n = 0 mod 4; n == 4 has its own basecase
   299  	jz	L(n4)
   300  
   301  L(otp0):lea	4(n), i			C outer pass for n = 0 mod 4, n >= 8; i = inner index
   302  	mov	(mp,n,8), %rax		C mp[0]
   303  	mul	q0
   304  	mov	%rax, %r11
   305  	mov	8(mp,n,8), %rax		C mp[1]
   306  	mov	%rdx, %r10
   307  	mul	q0
   308  	mov	%rax, %rbx
   309  	mov	16(mp,n,8), %rax	C mp[2]
   310  	mov	%rdx, %r9
   311  	mul	q0
   312  	add	(up,n,8), %r11		C only the carry is used, r11 is overwritten next
   313  	mov	%rax, %r11
   314  	adc	%r10, %rbx
   315  	mov	24(mp,n,8), %rax	C mp[3]
   316  	adc	$0, %r9
   317  	mov	%rdx, %r10
   318  	mul	q0
   319  	add	8(up,n,8), %rbx
   320  	mov	%rbx, 8(up,n,8)		C store new up[1], which is up[0] of the next pass
   321  	mov	%rax, %rbp
   322  	mov	32(mp,n,8), %rax	C mp[4]
   323  	adc	%r9, %r11
   324  	mov	%rdx, %r9
   325  	adc	$0, %r10
   326  	imul	u0inv, %rbx		C next q limb
   327  	jmp	L(e0)			C enter the unrolled loop at its second stage
   328  
   329  	ALIGNx
   330  L(tp0):	mul	q0			C inner loop, four limbs per iteration
   331  	add	%rbp, -24(up,i,8)
   332  	mov	%rax, %rbp
   333  	mov	(mp,i,8), %rax
   334  	adc	%r9, %r11
   335  	mov	%rdx, %r9
   336  	adc	$0, %r10
   337  L(e0):	mul	q0
   338  	add	%r11, -16(up,i,8)
   339  	mov	%rax, %r11
   340  	mov	8(mp,i,8), %rax
   341  	adc	%r10, %rbp
   342  	mov	%rdx, %r10
   343  	adc	$0, %r9
   344  	mul	q0
   345  	add	%rbp, -8(up,i,8)
   346  	mov	%rax, %rbp
   347  	adc	%r9, %r11
   348  	mov	16(mp,i,8), %rax
   349  	adc	$0, %r10
   350  	mov	%rdx, %r9
   351  	mul	q0
   352  	add	%r11, (up,i,8)
   353  	mov	%rax, %r11
   354  	adc	%r10, %rbp
   355  	mov	24(mp,i,8), %rax
   356  	adc	$0, %r9
   357  	add	$4, i
   358  	mov	%rdx, %r10		C mov keeps the flags set by the add above
   359  	js	L(tp0)			C loop while i is still negative
   360  
   361  L(ed0):	mul	q0			C wind-down: last product of this pass
   362  	add	%rbp, I(-24(up),-24(up,i,8))
   363  	adc	%r9, %r11
   364  	adc	$0, %r10
   365  	add	%r11, I(-16(up),-16(up,i,8))
   366  	adc	%r10, %rax
   367  	adc	$0, %rdx
   368  	add	%rax, I(-8(up),-8(up,i,8))
   369  	adc	$0, %rdx		C rdx = carry-out limb of this pass
   370  	mov	%rdx, (up,n,8)		C up[0]
   371  	mov	%rbx, q0		C previously computed q limb -> q0
   372  	lea	8(up), up		C up++
   373  	dec	j
   374  	jnz	L(otp0)
   375  	jmp	L(cj)
   376  
   377  L(b2):	cmp	$-2, R32(n)		C n = 2 mod 4; n == 2 has its own basecase
   378  	jz	L(n2)
   379  
   380  L(otp2):lea	2(n), i			C outer pass for n = 2 mod 4, n >= 6; i = inner index
   381  	mov	(mp,n,8), %rax		C mp[0]
   382  	mul	q0
   383  	mov	%rax, %r11
   384  	mov	8(mp,n,8), %rax		C mp[1]
   385  	mov	%rdx, %r10
   386  	mul	q0
   387  	mov	%rax, %rbx
   388  	mov	16(mp,n,8), %rax	C mp[2]
   389  	mov	%rdx, %r9
   390  	mul	q0
   391  	add	(up,n,8), %r11		C only the carry is used, r11 is overwritten next
   392  	mov	%rax, %r11
   393  	adc	%r10, %rbx
   394  	mov	24(mp,n,8), %rax	C mp[3]
   395  	adc	$0, %r9
   396  	mov	%rdx, %r10
   397  	mul	q0
   398  	add	8(up,n,8), %rbx
   399  	mov	%rbx, 8(up,n,8)		C store new up[1], which is up[0] of the next pass
   400  	mov	%rax, %rbp
   401  	mov	32(mp,n,8), %rax	C mp[4]
   402  	adc	%r9, %r11
   403  	mov	%rdx, %r9
   404  	adc	$0, %r10
   405  	imul	u0inv, %rbx		C next q limb
   406  	jmp	L(e2)			C enter the unrolled loop at its fourth stage
   407  
   408  	ALIGNx
   409  L(tp2):	mul	q0			C inner loop, four limbs per iteration
   410  	add	%rbp, -24(up,i,8)
   411  	mov	%rax, %rbp
   412  	mov	(mp,i,8), %rax
   413  	adc	%r9, %r11
   414  	mov	%rdx, %r9
   415  	adc	$0, %r10
   416  	mul	q0
   417  	add	%r11, -16(up,i,8)
   418  	mov	%rax, %r11
   419  	mov	8(mp,i,8), %rax
   420  	adc	%r10, %rbp
   421  	mov	%rdx, %r10
   422  	adc	$0, %r9
   423  	mul	q0
   424  	add	%rbp, -8(up,i,8)
   425  	mov	%rax, %rbp
   426  	adc	%r9, %r11
   427  	mov	16(mp,i,8), %rax
   428  	adc	$0, %r10
   429  	mov	%rdx, %r9
   430  L(e2):	mul	q0
   431  	add	%r11, (up,i,8)
   432  	mov	%rax, %r11
   433  	adc	%r10, %rbp
   434  	mov	24(mp,i,8), %rax
   435  	adc	$0, %r9
   436  	add	$4, i
   437  	mov	%rdx, %r10		C mov keeps the flags set by the add above
   438  	js	L(tp2)			C loop while i is still negative
   439  
   440  L(ed2):	mul	q0			C wind-down: last product of this pass
   441  	add	%rbp, I(-24(up),-24(up,i,8))
   442  	adc	%r9, %r11
   443  	adc	$0, %r10
   444  	add	%r11, I(-16(up),-16(up,i,8))
   445  	adc	%r10, %rax
   446  	adc	$0, %rdx
   447  	add	%rax, I(-8(up),-8(up,i,8))
   448  	adc	$0, %rdx		C rdx = carry-out limb of this pass
   449  	mov	%rdx, (up,n,8)		C up[0]
   450  	mov	%rbx, q0		C previously computed q limb -> q0
   451  	lea	8(up), up		C up++
   452  	dec	j
   453  	jnz	L(otp2)
   454  	jmp	L(cj)
   455  
   456  L(n1):	mov	(mp_param), %rax	C basecase n == 1: rax = mp[0]
   457  	mul	q0
   458  	add	-8(up), %rax		C add the original up[0]; only the carry is used
   459  	adc	(up), %rdx		C high product limb + up[1] + carry
   460  	mov	%rdx, (rp)		C rp[0]
   461  	mov	$0, R32(%rax)		C zero rax with mov so the carry flag survives
   462  	adc	R32(%rax), R32(%rax)	C return value = carry out of the adc above
   463  	jmp	L(ret)
   464  
   465  L(n2):	mov	(mp_param), %rax	C basecase n == 2, two passes in straight-line code
        C up was advanced by 2 limbs at entry, so -16(up), -8(up), (up) and 8(up)
        C are the original up[0..3]; -16(mp) and -8(mp) are mp[0] and mp[1].
   466  	mov	-16(up), %rbp
   467  	mul	q0
   468  	add	%rax, %rbp		C low limb sum; rbp is not read again, only the carry matters
   469  	mov	%rdx, %r9
   470  	adc	$0, %r9
   471  	mov	-8(mp), %rax
   472  	mov	-8(up), %r10
   473  	mul	q0
   474  	add	%rax, %r10
   475  	mov	%rdx, %r11
   476  	adc	$0, %r11
   477  	add	%r9, %r10		C r10 = new up[1]
   478  	adc	$0, %r11		C r11 = carry-out limb of the first pass
   479  	mov	%r10, q0
   480  	imul	u0inv, q0		C next q0
   481  	mov	-16(mp), %rax
   482  	mul	q0
   483  	add	%rax, %r10		C second pass low limb sum, only the carry matters
   484  	mov	%rdx, %r9
   485  	adc	$0, %r9
   486  	mov	-8(mp), %rax
   487  	mov	(up), %r14		C r14 is free as scratch here, no inner loop index is needed
   488  	mul	q0
   489  	add	%rax, %r14
   490  	adc	$0, %rdx
   491  	add	%r9, %r14
   492  	adc	$0, %rdx		C rdx = carry-out limb of the second pass
   493  	xor	R32(%rax), R32(%rax)	C clear rax before the final add chain, xor would destroy its carry
   494  	add	%r11, %r14		C fold in the first pass carry-out limb
   495  	adc	8(up), %rdx		C second pass carry-out limb + up[3] + carry
   496  	mov	%r14, (rp)		C rp[0]
   497  	mov	%rdx, 8(rp)		C rp[1]
   498  	adc	R32(%rax), R32(%rax)	C return value = final carry
   499  	jmp	L(ret)
   500  
   501  	ALIGNx
   502  L(n3):	mov	-24(mp), %rax		C basecase n == 3: one fully unrolled pass per iteration
   503  	mov	-24(up), %r10		C up[0]
   504  	mul	q0
   505  	add	%rax, %r10		C low limb sum; only the carry matters, r10 is reloaded below
   506  	mov	-16(mp), %rax
   507  	mov	%rdx, %r11
   508  	adc	$0, %r11
   509  	mov	-16(up), %rbp		C up[1]
   510  	mul	q0
   511  	add	%rax, %rbp
   512  	mov	%rdx, %r9
   513  	adc	$0, %r9
   514  	mov	-8(mp), %rax
   515  	add	%r11, %rbp		C rbp = new up[1]
   516  	mov	-8(up), %r10		C up[2]
   517  	adc	$0, %r9
   518  	mul	q0			C uses the current q0 before it is replaced below
   519  	mov	%rbp, q0
   520  	imul	u0inv, q0		C next q0
   521  	add	%rax, %r10
   522  	mov	%rdx, %r11
   523  	adc	$0, %r11
   524  	mov	%rbp, -16(up)
   525  	add	%r9, %r10		C r10 = new up[2]
   526  	adc	$0, %r11		C r11 = carry-out limb of this pass
   527  	mov	%r10, -8(up)
   528  	mov	%r11, -24(up)		C up[0]
   529  	lea	8(up), up		C up++
   530  	dec	j
   531  	jnz	L(n3)
   532  
        C Final inline addition.  up has advanced by 3 limbs, so -48(up) and
        C -40(up) are the carry-out limbs stored by the first two passes, while
        C rbp, r10 and r11 still hold the results of the last pass.
   533  	mov	-48(up), %rdx
   534  	mov	-40(up), %rbx
   535  	xor	R32(%rax), R32(%rax)	C clear rax before the add chain starts
   536  	add	%rbp, %rdx
   537  	adc	%r10, %rbx
   538  	adc	-8(up), %r11		C last carry-out limb + top input limb + carry
   539  	mov	%rdx, (rp)		C rp[0]
   540  	mov	%rbx, 8(rp)		C rp[1]
   541  	mov	%r11, 16(rp)		C rp[2]
   542  	adc	R32(%rax), R32(%rax)	C return value = final carry; the mov stores keep the flags
   543  	jmp	L(ret)
   544  
   545  L(n4):	mov	-32(mp), %rax		C basecase n == 4: one fully unrolled pass per iteration
   546  	mul	q0
   547  	mov	%rax, %r11
   548  	mov	-24(mp), %rax
   549  	mov	%rdx, %r10
   550  	mul	q0
   551  	mov	%rax, %rbx
   552  	mov	-16(mp), %rax
   553  	mov	%rdx, %r9
   554  	mul	q0
   555  	add	-32(up), %r11		C only the carry is used, r11 is overwritten next
   556  	mov	%rax, %r11
   557  	adc	%r10, %rbx
   558  	mov	-8(mp), %rax
   559  	adc	$0, %r9
   560  	mov	%rdx, %r10
   561  	mul	q0
   562  	add	-24(up), %rbx
   563  	mov	%rbx, -24(up)		C store new up[1], which is up[0] of the next pass
   564  	adc	%r9, %r11
   565  	adc	$0, %r10
   566  	imul	u0inv, %rbx		C next q limb
   567  	add	%r11, -16(up)
   568  	adc	%r10, %rax
   569  	adc	$0, %rdx
   570  	add	%rax, -8(up)
   571  	adc	$0, %rdx		C rdx = carry-out limb of this pass
   572  	mov	%rdx, -32(up)		C up[0]
   573  	mov	%rbx, q0		C previously computed q limb -> q0
   574  	dec	j
   575  	lea	8(up), up		C up++
   576  	jnz	L(n4)			C tests the flags from dec, lea does not change them
   577  	jmp	L(cj)			C finish with the shared mpn_add_n call
   578  EPILOGUE()
   579  ASM_END()