github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_34lsub1.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mod_34lsub1.asm (about)

     1  dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
     2  
     3  dnl  Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
     4  dnl  Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C	    cycles/limb
    36  C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
    37  C AMD K10	 0.67	   this seems hard to beat
    38  C AMD bd1	 1
    39  C AMD bobcat	 1.07
    40  C Intel P4	 7.35	   terrible, use old code
    41  C Intel core2	 1.25	   1+epsilon with huge unrolling
    42  C Intel NHM	 1.15	   this seems hard to beat
    43  C Intel SBR	 0.93
    44  C Intel atom	 2.5
    45  C VIA nano	 1.25	   this seems hard to beat
    46  
    47  C INPUT PARAMETERS
    48  define(`ap',	%rdi)	C pointer to the source limbs (named up in the C prototype)
    49  define(`n',	%rsi)	C number of limbs to process
    50  
    51  C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
        C
        C Folds the n-limb operand at up into one limb that is congruent to it
        C modulo 2^48-1.  Since 2^48 == 1 (mod 2^48-1), the 64-bit limb at index i
        C has weight 2^(64*i) == 2^(16*(i mod 3)).  Limbs are therefore summed into
        C three accumulators selected by index mod 3, and the three sums are then
        C split and re-added at bit offsets 0, 16 and 32.
        C
        C No final reduction is performed here: for n < 2 the first limb is
        C returned unchanged, otherwise the result is a sum of pieces of at most
        C 48 bits each, so it can exceed 2^48-1.
        C
        C Register use in the n > 2 path:
        C   rax		sum of limbs with index 0 mod 3, also the return value
        C   rcx		sum of limbs with index 1 mod 3
        C   rdx		sum of limbs with index 2 mod 3
        C   r9		number of carries out of rdx (weight 2^192 == 1)
        C   r11		the constant 2^48-1, used as a mask
        C   rsi		(limbs left) - 9 while looping, then the jump table index
        C   r8, r10	scratch for the jump table dispatch and the final fold
    52  
    53  C TODO
    54  C  * Review feed-in and wind-down code.
    55  
    56  ABI_SUPPORT(DOS64)
    57  ABI_SUPPORT(STD64)
    58  
    59  ASM_START()
    60  	TEXT
    61  	ALIGN(32)
    62  PROLOGUE(mpn_mod_34lsub1)
    63  	FUNC_ENTRY(2)
    64  
    65  	mov	$0x0000FFFFFFFFFFFF, %r11	C r11 = 2^48-1, mask for the low 48 bits
    66  
    67  	mov	(ap), %rax		C rax = src[0]
    68  
    69  	cmp	$2, %rsi		C rsi holds n
    70  	ja	L(gt2)			C n > 2: general case
    71  
    72  	jb	L(one)			C n < 2: return src[0] as is
    73  
        C n = 2: fold src[0] and src[1] directly, no loop needed.
    74  	mov	8(ap), %rsi		C rsi = src[1], n is no longer needed
    75  	mov	%rax, %rdx
    76  	shr	$48, %rax		C src[0] high 16 bits (weight 2^48 == 1)
    77  
    78  	and	%r11, %rdx		C src[0] low 48 bits
    79  	add	%rdx, %rax
    80  	mov	R32(%rsi), R32(%rdx)	C src[1] low 32 bits, assuming R32 names the 32-bit register
    81  
    82  	shr	$32, %rsi		C src[1] high 32 bits (weight 2^96 == 1)
    83  	add	%rsi, %rax
    84  
    85  	shl	$16, %rdx		C src[1] low 32 bits (weight 2^64 == 2^16)
    86  	add	%rdx, %rax
    87  L(one):	FUNC_EXIT()
    88  	ret
    89  
    90  
    91  C Don't change this, the wind-down code is not able to handle greater values
        C Each unrolled group adds 3 limbs, so one pass of the loop at L(top)
        C consumes UNROLL*3 = 9 limbs and the table at L(tab) has 9 entries, for
        C 0 to 8 leftover limbs.
    92  define(UNROLL,3)
    93  
    94  L(gt2):	mov	8(ap), %rcx		C rcx = src[1]
    95  	mov	16(ap), %rdx		C rdx = src[2]
    96  	xor	%r9, %r9		C r9 = 0, counts carries out of rdx
    97  	add	$24, ap			C step past the 3 limbs just loaded
    98  	sub	$eval(UNROLL*3+3), %rsi	C rsi = n - 12 = (limbs left) - 9
    99  	jc	L(end)			C fewer than 9 limbs left: skip the loop
        C Main loop.  At L(top) rsi = (limbs left) - 9 and is not negative.  Within
        C each group of three limbs the carry flag chains rax to rcx to rdx to r9.
   100  	ALIGN(16)
   101  L(top):
   102  	add	(ap), %rax
   103  	adc	8(ap), %rcx
   104  	adc	16(ap), %rdx
   105  	adc	$0, %r9
        C The forloop below presumably repeats the group above for i = 1 .. UNROLL-1
        C at byte offsets i*24 (forloop is defined outside this file).
   106  forloop(i,1,UNROLL-1,`dnl
   107  	add	eval(i*24)(ap), %rax
   108  	adc	eval(i*24+8)(ap), %rcx
   109  	adc	eval(i*24+16)(ap), %rdx
   110  	adc	$0, %r9
   111  ')dnl
   112  	add	$eval(UNROLL*24), ap	C advance past the limbs just added
   113  	sub	$eval(UNROLL*3), %rsi	C that many fewer limbs left
   114  	jnc	L(top)			C no borrow: a full pass is still available
   115  
        C Here rsi = (limbs left) - 9, in the range -9 .. -1.  Dispatch through
        C L(tab) on rsi + 9, so that L(k) is entered with k limbs left.  The PIC
        C variant reads sign-extended 4-byte entries (displacement 36 = 9*4) and
        C adds them to the address of L(tab); the other variant jumps through
        C 8-byte entries (displacement 72 = 9*8).
   116  L(end):
   117  	lea	L(tab)(%rip), %r8
   118  ifdef(`PIC',
   119  `	movslq	36(%r8,%rsi,4), %r10
   120  	add	%r10, %r8
   121  	jmp	*%r8
   122  ',`
   123  	jmp	*72(%r8,%rsi,8)
   124  ')
   125  	JUMPTABSECT
   126  	ALIGN(8)
   127  L(tab):	JMPENT(	L(0), L(tab))
   128  	JMPENT(	L(1), L(tab))
   129  	JMPENT(	L(2), L(tab))
   130  	JMPENT(	L(3), L(tab))
   131  	JMPENT(	L(4), L(tab))
   132  	JMPENT(	L(5), L(tab))
   133  	JMPENT(	L(6), L(tab))
   134  	JMPENT(	L(7), L(tab))
   135  	JMPENT(	L(8), L(tab))
   136  	TEXT
   137  
        C Wind-down.  L(k) adds the last k limbs.  Each entry adds one full group
        C of three limbs and falls through to the entry for k-3, so the chains
        C are 6 to 3, 7 to 4 to 1, and 8 to 5 to 2.  Every entry starts with add
        C rather than adc, so the carry flag left by the preceding code is ignored.
   138  L(6):	add	(ap), %rax
   139  	adc	8(ap), %rcx
   140  	adc	16(ap), %rdx
   141  	adc	$0, %r9
   142  	add	$24, ap
   143  L(3):	add	(ap), %rax
   144  	adc	8(ap), %rcx
   145  	adc	16(ap), %rdx
   146  	jmp	L(cj1)			C carry out of rdx is added to r9 at L(cj1)
   147  
   148  L(7):	add	(ap), %rax
   149  	adc	8(ap), %rcx
   150  	adc	16(ap), %rdx
   151  	adc	$0, %r9
   152  	add	$24, ap
   153  L(4):	add	(ap), %rax
   154  	adc	8(ap), %rcx
   155  	adc	16(ap), %rdx
   156  	adc	$0, %r9
   157  	add	$24, ap
   158  L(1):	add	(ap), %rax		C single leftover limb, index 0 mod 3
   159  	adc	$0, %rcx
   160  	jmp	L(cj2)
   161  
   162  L(8):	add	(ap), %rax
   163  	adc	8(ap), %rcx
   164  	adc	16(ap), %rdx
   165  	adc	$0, %r9
   166  	add	$24, ap
   167  L(5):	add	(ap), %rax
   168  	adc	8(ap), %rcx
   169  	adc	16(ap), %rdx
   170  	adc	$0, %r9
   171  	add	$24, ap
   172  L(2):	add	(ap), %rax		C two leftover limbs, indices 0 and 1 mod 3
   173  	adc	8(ap), %rcx
   174  
   175  L(cj2):	adc	$0, %rdx		C propagate the carry out of rcx
   176  L(cj1):	adc	$0, %r9			C propagate the carry out of rdx
   177  L(0):	add	%r9, %rax		C carries counted in r9 have weight 2^192 == 1
   178  	adc	$0, %rcx
   179  	adc	$0, %rdx
   180  	adc	$0, %rax		C wrap a carry out of rdx around to rax
   181  
        C Final fold of the three sums into rax.  rcx has weight 2^64 == 2^16 and
        C rdx has weight 2^128 == 2^32, so each sum is split where its bits reach
        C 2^48 and the upper part is added back at weight 1.  rdi is free for
        C scratch use here because the source pointer is no longer needed.
   182  	mov	%rax, %rdi		C 0mod3
   183  	shr	$48, %rax		C 0mod3 high
   184  
   185  	and	%r11, %rdi		C 0mod3 low
   186  	mov	R32(%rcx), R32(%r10)	C 1mod3
   187  
   188  	shr	$32, %rcx		C 1mod3 high
   189  
   190  	add	%rdi, %rax		C apply 0mod3 low
   191  	movzwl	%dx, R32(%rdi)		C 2mod3
   192  	shl	$16, %r10		C 1mod3 low
   193  
   194  	add	%rcx, %rax		C apply 1mod3 high
   195  	shr	$16, %rdx		C 2mod3 high
   196  
   197  	add	%r10, %rax		C apply 1mod3 low
   198  	shl	$32, %rdi		C 2mod3 low
   199  
   200  	add	%rdx, %rax		C apply 2mod3 high
   201  	add	%rdi, %rax		C apply 2mod3 low
   202  
   203  	FUNC_EXIT()
   204  	ret
   205  EPILOGUE()