github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/dive_1.asm (about)

     1  dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
     2  
     3  dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C	     cycles/limb
    35  C AMD K8,K9	10
    36  C AMD K10	10
    37  C Intel P4	33
    38  C Intel core2	13.25
    39  C Intel corei	14
    40  C Intel atom	42
    41  C VIA nano	43
    42  
    43  C A quick adaptation of the 32-bit K7 code.
    44  
    45  
    46  C INPUT PARAMETERS
    47  C rp		rdi
    48  C up		rsi
    49  C n		rdx
    50  C divisor	rcx
    51  
    52  ABI_SUPPORT(DOS64)
    53  ABI_SUPPORT(STD64)
    54  
        C mpn_divexact_1 stores n quotient limbs at rp, computed from the n limbs
        C at up and a single-limb divisor.  No divide instruction is executed:
        C
        C   1. Low zero bits of the divisor are counted with bsf (shift count kept
        C      in rcx) and shifted out, leaving an odd d.
        C   2. inv = d^(-1) mod 2^64 is built from an 8-bit lookup in
        C      binvert_limb_table followed by three steps of
        C      inv = 2*inv - inv*inv*d, valid to 16, 32 and then 64 bits.
        C   3. Each quotient limb is q = (s - carry bit - carry limb) * inv mod
        C      2^64, where s is the source shifted right by the count in rcx and
        C      the carry limb is the high half of the previous q*d.
        C
        C Only the low 64 bits of each step are kept, so the result is presumably
        C the true quotient only when the division is exact, as the title of this
        C file says -- TODO confirm against the documented contract.
        C
        C NOTE(review): up[0] is loaded and one limb is stored before n is ever
        C tested, so n >= 1 is assumed.
        C NOTE(review): a zero divisor is not checked for; presumably callers must
        C never pass one -- confirm.
        C
        C rbx is callee-saved in both supported ABIs, hence the push/pop pair.
        C FUNC_ENTRY and FUNC_EXIT are defined outside this file; presumably they
        C map the DOS64 argument registers onto the STD64 ones listed under INPUT
        C PARAMETERS -- confirm in the m4 definitions.

    55  ASM_START()
    56  	TEXT
    57  	ALIGN(16)
    58  PROLOGUE(mpn_divexact_1)
    59  	FUNC_ENTRY(4)
    60  	push	%rbx
    61  
        	C rax = divisor, rcx = 0 (the shift count used for an odd divisor),
        	C r8 = n.  n is moved out of rdx because rdx is reused as scratch.
    62  	mov	%rcx, %rax
    63  	xor	R32(%rcx), R32(%rcx)	C shift count
    64  	mov	%rdx, %r8
    65  
        	C bt copies bit 0 of the divisor into CF; CF clear means even.
    66  	bt	$0, R32(%rax)
    67  	jnc	L(evn)			C skip bsfq unless divisor is even
    68  
        	C Reached by fall-through for an odd divisor, or from L(evn) once the
        	C low zero bits have been shifted out.  rax = odd d on arrival; a copy
        	C is kept in rbx while rax is turned into the table index.
    69  L(odd):	mov	%rax, %rbx
    70  	shr	R32(%rax)
    71  	and	$127, R32(%rax)		C d/2, 7 bits
    72  
        	C The table is indexed by bits 1..7 of d (bit 0 is known to be set).
    73  	LEA(	binvert_limb_table, %rdx)
    74  
    75  	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
    76  
    77  	mov	%rbx, %r11		C d without twos
    78  
        	C Three refinement steps follow.  Each doubles the number of valid low
        	C bits of inv, so the first two are done with 32-bit operations and
        	C only the last one needs the full 64-bit registers.  The result
        	C alternates between rdx and rax and finally lands in r10.
    79  	lea	(%rax,%rax), R32(%rdx)	C 2*inv
    80  	imul	R32(%rax), R32(%rax)	C inv*inv
    81  	imul	R32(%rbx), R32(%rax)	C inv*inv*d
    82  	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
    83  
    84  	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
    85  	imul	R32(%rdx), R32(%rdx)	C inv*inv
    86  	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
    87  	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
    88  
    89  	lea	(%rax,%rax), %r10	C 2*inv
    90  	imul	%rax, %rax		C inv*inv
    91  	imul	%rbx, %rax		C inv*inv*d
    92  	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits
    93  
        	C Point rsi one limb past the end of the source and rdi at the last
        	C destination limb, then index both with a negative counter in r8
        	C that runs up towards zero.
    94  	lea	(%rsi,%r8,8), %rsi	C up end
    95  	lea	-8(%rdi,%r8,8), %rdi	C rp end
    96  	neg	%r8			C -n
    97  
    98  	mov	(%rsi,%r8,8), %rax	C up[0]
    99  
        	C r8 becomes -(n-1); zero here means n == 1, which has no higher limb
        	C to shift in and is finished at L(one).
   100  	inc	%r8
   101  	jz	L(one)
   102  
   103  	mov	(%rsi,%r8,8), %rdx	C up[1]
   104  
        	C rax = low limb of (up[1]:up[0]) shifted right by the count in cl.
   105  	shrd	R8(%rcx), %rdx, %rax
   106  
        	C No borrow is pending for the first limb, and nothing has to be
        	C subtracted from it, so the loop is entered at the multiply by inv.
   107  	xor	R32(%rbx), R32(%rbx)
   108  	jmp	L(ent)
   109  
        	C Even divisor: rcx = number of low zero bits, rax = divisor with
        	C those bits shifted out.
        	C NOTE(review): bsf leaves its destination undefined for a zero
        	C source, so this path relies on a nonzero divisor.
   110  L(evn):	bsf	%rax, %rcx
   111  	shr	R8(%rcx), %rax
   112  	jmp	L(odd)
   113  
   114  	ALIGN(8)
   115  L(top):
   116  	C rax	q
   117  	C rbx	carry bit, 0 or 1
   118  	C rcx	shift
   119  	C rdx
   120  	C rsi	up end
   121  	C rdi	rp end
   122  	C r8	counter, limbs, negative
   123  	C r10	d^(-1) mod 2^64
   124  	C r11	d, shifted down
        	C
        	C rdx is written by the mul below (high half of q*d, the carry limb)
        	C and r9 is scratch for the source limb whose low bits are shifted in.
        	C rax arrives holding the quotient limb stored by the previous pass.
   125  
   126  	mul	%r11			C carry limb in rdx	0 10
   127  	mov	-8(%rsi,%r8,8), %rax	C
   128  	mov	(%rsi,%r8,8), %r9	C
   129  	shrd	R8(%rcx), %r9, %rax	C
        	C NOTE(review): the nop has no architectural effect; presumably it is
        	C kept for scheduling or alignment -- do not remove without measuring.
   130  	nop				C
        	C The borrows of the two subtractions are collected in rbx.  setc
        	C writes only bl; the upper bits of rbx are still zero from the xor
        	C executed before the loop was entered.
   131  	sub	%rbx, %rax		C apply carry bit
   132  	setc	%bl			C
   133  	sub	%rdx, %rax		C apply carry limb	5
   134  	adc	$0, %rbx		C			6
        	C q = rax * inv mod 2^64, stored through the negative index in r8.
   135  L(ent):	imul	%r10, %rax		C			6
   136  	mov	%rax, (%rdi,%r8,8)	C
   137  	inc	%r8			C
   138  	jnz	L(top)
   139  
        	C Last limb: there is no higher source limb to shift in, so a plain
        	C shr replaces shrd, and borrows out of this limb are not recorded.
   140  	mul	%r11			C carry limb in rdx
   141  	mov	-8(%rsi), %rax		C up high limb
   142  	shr	R8(%rcx), %rax
   143  	sub	%rbx, %rax		C apply carry bit
   144  	sub	%rdx, %rax		C apply carry limb
   145  	imul	%r10, %rax
   146  	mov	%rax, (%rdi)
   147  	pop	%rbx
   148  	FUNC_EXIT()
   149  	ret
   150  
        	C Single-limb case: q = (up[0] >> shift) * inv mod 2^64; rdi already
        	C points at rp[0] because n == 1.
   151  L(one):	shr	R8(%rcx), %rax
   152  	imul	%r10, %rax
   153  	mov	%rax, (%rdi)
   154  	pop	%rbx
   155  	FUNC_EXIT()
   156  	ret
   157  
   158  EPILOGUE()