github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/pentium4/mod_34lsub1.asm (about)

     1  dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
     2  
     3  dnl  Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
     4  dnl  Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9	 1.0
    37  C AMD K10	 1.12
    38  C Intel P4	 3.25
    39  C Intel core2	 1.5
    40  C Intel corei	 1.5
    41  C Intel atom	 2.5
    42  C VIA nano	 1.75
    43  
    44  
    45  C INPUT PARAMETERS
    46  define(`ap',	%rdi)		C pointer to the source limbs
    47  define(`n',	%rsi)		C limb count; the code below names %rsi directly
    48  
    49  C mp_limb_t mpn_mod_34lsub1 (mp_srcptr ap, mp_size_t n)
    50  
    51  C TODO
    52  C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
    53  C    sbb to placate Pentium4.
    54  C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
    55  C    without the dual loop exits.
    56  
    57  ABI_SUPPORT(DOS64)
    58  ABI_SUPPORT(STD64)
    59  
    60  ASM_START()
    61  	TEXT
    62  	ALIGN(32)
        C mpn_mod_34lsub1 (ap, n) -- fold the n limbs at ap into a single limb that
        C is congruent to the operand modulo 2^48-1.  The result is NOT fully
        C reduced: it can use all 64 bits (for n = 1 the limb is returned as is).
        C
        C Since 2^48 == 1 (mod 2^48-1), a 64-bit limb at index i has weight
        C 2^(64*i) == 2^(16*(i mod 3)):
        C     i == 0 mod 3  ->  2^0
        C     i == 1 mod 3  ->  2^16
        C     i == 2 mod 3  ->  2^32
        C
        C Register roles on the n > 2 path:
        C     %rax, %rcx, %rdx   64-bit sums of the 0mod3, 1mod3, 2mod3 limbs
        C     %r10, %r8,  %r9    carries counted out of %rax, %rcx, %rdx
        C     %rsi               (limbs not yet added) - 2
        C     %r11               the mask 2^48-1
        C     %rdi               ap, reused as scratch once all limbs are read
        C
        C n is expected to be at least 1: (ap) is loaded unconditionally.
        C NOTE(review): R32(reg) is presumably the 32-bit name of reg, and
        C FUNC_ENTRY/FUNC_EXIT presumably adapt the DOS64 ABI; both macros are
        C defined outside this file -- confirm there.
    63  PROLOGUE(mpn_mod_34lsub1)
    64  	FUNC_ENTRY(2)
    65  
    66  	mov	$0x0000FFFFFFFFFFFF, %r11	C r11 = 2^48-1, low-48-bit mask
    67  
    68  	sub	$2, %rsi		C rsi = n-2; flags feed ja and jb below
    69  	ja	L(gt2)			C n > 2: general summation loop
    70  
    71  	mov	(ap), %rax		C rax = src[0]
    72  	nop				C mov and nop keep the flags from sub
    73  	jb	L(1)			C borrow: n < 2, return src[0] unreduced
    74  
    75  	mov	8(ap), %rsi		C n = 2: rsi = src[1]
    76  	mov	%rax, %rdx
    77  	shr	$48, %rax		C src[0] high 16 bits, weight 2^48 == 1
    78  
    79  	and	%r11, %rdx		C src[0] low 48 bits
    80  	add	%rdx, %rax
    81  	mov	R32(%rsi), R32(%rdx)	C rdx = low 32 bits of src[1]
    82  
    83  	shr	$32, %rsi		C src[1] high 32 bits, weight 2^96 == 1
    84  	add	%rsi, %rax
    85  
    86  	shl	$16, %rdx		C src[1] low 32 bits, weight 2^64 == 2^16
    87  	add	%rdx, %rax
    88  
    89  L(1):	FUNC_EXIT()
    90  	ret
    91  
    92  
    93  	ALIGN(16)
    94  L(gt2):	xor	R32(%rax), R32(%rax)	C clear the three sums ...
    95  	xor	R32(%rcx), R32(%rcx)
    96  	xor	R32(%rdx), R32(%rdx)
    97  	xor	%r8, %r8		C ... and the three carry counters
    98  	xor	%r9, %r9
    99  	xor	%r10, %r10
   100  
        C Loop body handles 6 limbs per iteration with an exit after each 3.
        C Each add is followed by adc $0 into the matching carry counter.
   101  L(top):	add	(ap), %rax		C 0mod3 limb
   102  	adc	$0, %r10
   103  	add	8(ap), %rcx		C 1mod3 limb
   104  	adc	$0, %r8
   105  	add	16(ap), %rdx		C 2mod3 limb
   106  	adc	$0, %r9
   107  
   108  	sub	$3, %rsi
   109  	jng	L(end)			C at most 2 limbs left; ap not advanced
   110  
   111  	add	24(ap), %rax		C 0mod3 limb
   112  	adc	$0, %r10
   113  	add	32(ap), %rcx		C 1mod3 limb
   114  	adc	$0, %r8
   115  	add	40(ap), %rdx		C 2mod3 limb
   116  	lea	48(ap), ap		C step 6 limbs; lea keeps CF for the adc
   117  	adc	$0, %r9
   118  
   119  	sub	$3, %rsi
   120  	jg	L(top)			C more than 2 limbs left: iterate
   121  
   122  
   123  	add	$-24, ap		C leftovers now at 24(ap), 32(ap) as on the jng exit
   124  L(end):	add	%r9, %rax		C carries out of 2mod3 weigh 2^192 == 1
   125  	adc	%r10, %rcx		C carries out of 0mod3 weigh 2^64 == 2^16
   126  	adc	%r8, %rdx		C carries out of 1mod3 weigh 2^128 == 2^32
   127  
        C CF now holds a carry out of rdx (weight 1).  inc, dec and mov below leave
        C CF alone, so it chains through the optional adc steps into sbb at combine.
        C r10 tracks the weight of whichever carry is pending at that point.
   128  	inc	%rsi			C rsi = leftover limbs - 1
   129  	mov	$0x1, R32(%r10)		C pending carry weight: 2^192 == 1
   130  	js	L(combine)		C no leftover limb
   131  
   132  	mov	$0x10000, R32(%r10)	C carry out of rax will weigh 2^16
   133  	adc	24(ap), %rax		C leftover 0mod3 limb plus pending carry
   134  	dec	%rsi
   135  	js	L(combine)		C exactly one leftover limb
   136  
   137  	adc	32(ap), %rcx		C leftover 1mod3 limb plus pending carry
   138  	mov	$0x100000000, %r10	C carry out of rcx weighs 2^32
   139  
        C Split each sum at the bit where its weight crosses 2^48 and add all the
        C pieces, together with the pending carry, into rax.
   140  L(combine):
   141  	sbb	%rsi, %rsi		C carry: rsi = -CF (all ones or zero)
   142  	mov	%rax, %rdi		C 0mod3 (ap is dead, rdi is scratch)
   143  	shr	$48, %rax		C 0mod3 high
   144  
   145  	and	%r10, %rsi		C carry masked: its weight, or zero
   146  	and	%r11, %rdi		C 0mod3 low
   147  	mov	R32(%rcx), R32(%r10)	C 1mod3
   148  
   149  	add	%rsi, %rax		C apply carry
   150  	shr	$32, %rcx		C 1mod3 high
   151  
   152  	add	%rdi, %rax		C apply 0mod3 low
   153  	movzwl	%dx, R32(%rdi)		C 2mod3
   154  	shl	$16, %r10		C 1mod3 low
   155  
   156  	add	%rcx, %rax		C apply 1mod3 high
   157  	shr	$16, %rdx		C 2mod3 high
   158  
   159  	add	%r10, %rax		C apply 1mod3 low
   160  	shl	$32, %rdi		C 2mod3 low
   161  
   162  	add	%rdx, %rax		C apply 2mod3 high
   163  	add	%rdi, %rax		C apply 2mod3 low
   164  
   165  	FUNC_EXIT()
   166  	ret
   167  EPILOGUE()