github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mode1o.asm

dnl  AMD64 mpn_modexact_1_odd -- Hensel norm remainder.

dnl  Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	10
C AMD K10	10
C Intel P4	33
C Intel core2	13
C Intel corei	14.5
C Intel atom	35
C VIA nano	 ?


C The dependent chain in the main loop is
C
C                            cycles
C	sub	%rdx, %rax	1
C	imul	%r9, %rax	4
C	mul	%r8		5
C			      ----
C       total		       10
C
C The mov load from src seems to need to be scheduled back before the jz to
C achieve this speed; out-of-order execution apparently can't completely hide
C the latency otherwise.
C
C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
C for the first iteration (where there's no cbit).
C
C The code alignment used (32-byte) for the loop also seems necessary.  Without
C that the non-PIC case has adc crossing the 0x60 offset, apparently making it
C run at 11 cycles instead of 10.
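C
C In C terms the loop-carried recurrence is roughly the following (an
C illustrative sketch, not part of the original source; umulhi is a
C hypothetical helper standing for the high 64 bits of the unsigned
C 64x64-bit product that mul leaves in %rdx):
C
C	l = l - climb;			/* sub,  1 cycle  */
C	q = l * inv;			/* imul, 4 cycles */
C	climb = umulhi (q, d);		/* mul,  5 cycles */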


ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_modexact_1_odd)
	FUNC_ENTRY(3)
	mov	$0, R32(%rcx)
IFDOS(`	jmp	L(ent)		')

PROLOGUE(mpn_modexact_1c_odd)
	FUNC_ENTRY(4)
L(ent):
	C rdi	src
	C rsi	size
	C rdx	divisor
	C rcx	carry

	mov	%rdx, %r8		C d
	shr	R32(%rdx)		C d/2

	LEA(	binvert_limb_table, %r9)

	and	$127, R32(%rdx)
	mov	%rcx, %r10		C initial carry

	movzbl	(%r9,%rdx), R32(%rdx)	C inv 8 bits

	mov	(%rdi), %rax		C src[0]
	lea	(%rdi,%rsi,8), %r11	C src end
	mov	%r8, %rdi		C d, made available to imull

	lea	(%rdx,%rdx), R32(%rcx)	C 2*inv
	imul	R32(%rdx), R32(%rdx)	C inv*inv

	neg	%rsi			C -size

	imul	R32(%rdi), R32(%rdx)	C inv*inv*d

	sub	R32(%rdx), R32(%rcx)	C inv = 2*inv - inv*inv*d, 16 bits

	lea	(%rcx,%rcx), R32(%rdx)	C 2*inv
	imul	R32(%rcx), R32(%rcx)	C inv*inv

	imul	R32(%rdi), R32(%rcx)	C inv*inv*d

	sub	R32(%rcx), R32(%rdx)	C inv = 2*inv - inv*inv*d, 32 bits
	xor	R32(%rcx), R32(%rcx)	C initial cbit

	lea	(%rdx,%rdx), %r9	C 2*inv
	imul	%rdx, %rdx		C inv*inv

	imul	%r8, %rdx		C inv*inv*d

	sub	%rdx, %r9		C inv = 2*inv - inv*inv*d, 64 bits
	mov	%r10, %rdx		C initial climb

	ASSERT(e,`	C d*inv == 1 mod 2^64
	mov	%r8, %r10
	imul	%r9, %r10
	cmp	$1, %r10')
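
C The three "inv = 2*inv - inv*inv*d" steps above are Newton/Hensel
C iterations, each doubling the number of correct low bits of the inverse
C (8 -> 16 -> 32 -> 64).  A rough C equivalent (an illustrative sketch, not
C part of the original source; binvert_limb_table holds the 8-bit inverses
C of the odd residues, indexed by (d>>1) & 127, and the 32-bit steps are
C shown in 64-bit types for brevity):
C
C	uint64_t inv = binvert_limb_table[(d >> 1) & 127]; /* 8 bits  */
C	inv = 2*inv - inv*inv*d;                           /* 16 bits */
C	inv = 2*inv - inv*inv*d;                           /* 32 bits */
C	inv = 2*inv - inv*inv*d;                           /* 64 bits */
C	/* now d*inv == 1 (mod 2^64), as the ASSERT checks */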

	inc	%rsi
	jz	L(one)


	ALIGN(16)
L(top):
	C rax	l = src[i]-cbit
	C rcx	new cbit, 0 or 1
	C rdx	climb, high of last product
	C rsi	counter, limbs, negative
	C rdi
	C r8	divisor
	C r9	inverse
	C r11	src end ptr

	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	mov	(%r11,%rsi,8), %rax	C src[i+1]
	sub	%rcx, %rax		C next l = src[i+1] - cbit
	setc	R8(%rcx)		C new cbit

	inc	%rsi
	jnz	L(top)
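
C One pass through the loop corresponds roughly to the following C (an
C illustrative sketch, not part of the original source; umulhi and new_cbit
C are hypothetical names): the adc folds the borrow from subtracting climb
C into cbit, and the sub/setc pair both subtracts cbit from the next limb
C and records the borrow it produces.
C
C	cbit += (l < climb);		/* adc: borrow from climb subtract */
C	l -= climb;			/* sub                             */
C	q = l * inv;			/* imul                            */
C	climb = umulhi (q, d);		/* mul                             */
C	l = src[++i];			/* load scheduled back early       */
C	new_cbit = (l < cbit);		/* setc                            */
C	l -= cbit;			/* sub                             */
C	cbit = new_cbit;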


L(one):
	sub	%rdx, %rax		C l = src[i]-cbit - climb

	adc	$0, %rcx		C more cbit
	imul	%r9, %rax		C q = l * inverse

	mul	%r8			C climb = high (q * d)

	lea	(%rcx,%rdx), %rax	C climb+cbit
	FUNC_EXIT()
	ret

EPILOGUE(mpn_modexact_1c_odd)
EPILOGUE(mpn_modexact_1_odd)
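
C For reference, the whole routine can be sketched in C along the following
C lines.  This is an illustration under stated assumptions, not GMP code:
C modexact_1c_odd_sketch and umulhi are hypothetical names, the inverse is
C seeded arithmetically ((3*d)^2 gives 5 correct low bits) instead of from
C binvert_limb_table, and unsigned __int128 (GCC/Clang on 64-bit targets)
C supplies the high half of the product.  The intent is to return r such
C that r*B^n + {src,n} - c is divisible by d, where B = 2^64.
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	/* hypothetical helper: high 64 bits of the unsigned product */
C	static uint64_t umulhi (uint64_t a, uint64_t b)
C	{
C	  return (uint64_t) (((unsigned __int128) a * b) >> 64);
C	}
C
C	uint64_t
C	modexact_1c_odd_sketch (const uint64_t *src, size_t n,
C				uint64_t d, uint64_t c)	/* n >= 1, d odd */
C	{
C	  uint64_t inv = (3 * d) ^ 2;	/* 5 correct low bits */
C	  inv = 2*inv - inv*inv*d;	/* 10 bits */
C	  inv = 2*inv - inv*inv*d;	/* 20 bits */
C	  inv = 2*inv - inv*inv*d;	/* 40 bits */
C	  inv = 2*inv - inv*inv*d;	/* 64 bits */
C
C	  uint64_t climb = c, cbit = 0;
C	  for (size_t i = 0; i < n; i++)
C	    {
C	      uint64_t s = src[i];
C	      uint64_t l = s - cbit;
C	      cbit = (s < cbit);	/* borrow from the cbit subtract  */
C	      cbit += (l < climb);	/* borrow from the climb subtract */
C	      l -= climb;
C	      uint64_t q = l * inv;	/* q = l / d mod 2^64 */
C	      climb = umulhi (q, d);	/* so q*d = l + climb*2^64 */
C	    }
C	  return climb + cbit;
C	}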