github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bdiv_q_1.asm (about)

     1  dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
     2  dnl  1-limb divisor, returning quotient only.
     3  
     4  dnl  Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012 Free Software
     5  dnl  Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C	     cycles/limb
    37  C AMD K8,K9	10
    38  C AMD K10	10
    39  C Intel P4	33
    40  C Intel core2	13.25
    41  C Intel corei	14
    42  C Intel atom	42
    43  C VIA nano	 ?
    44  
    45  
    46  C INPUT PARAMETERS
    47  define(`rp',		`%rdi')		C	quotient limbs are stored through this pointer
    48  define(`up',		`%rsi')		C	source limbs are loaded through this pointer
    49  define(`n',		`%rdx')		C	limb count, copied to r10 by both entry points
    50  define(`d',		`%rcx')		C	1-limb divisor
    51  define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1, inverse each limb is multiplied by
    52  define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1, right shift count applied to up
    53  
    54  ABI_SUPPORT(DOS64)
    55  ABI_SUPPORT(STD64)
    56  
    57  ASM_START()
    58  	TEXT
    59  	ALIGN(16)
    60  PROLOGUE(mpn_bdiv_q_1)
        	C mpn_bdiv_q_1 (rp, up, n, d)
        	C
        	C Entry point that takes only the divisor.  It computes the two extra
        	C values mpn_pi1_bdiv_q_1 receives as arguments, then jumps into that
        	C function at L(com):
        	C   rcx  count of low zero bits shifted out of d (0 when d is odd)
        	C   r8   64-bit inverse of the odd part of d, seeded from an 8-bit
        	C        binvert_limb_table entry and widened by three Newton steps
        	C Also set up for L(com): r10 = n, r11 = odd part of d.  rp and up are
        	C not modified here.  The rbx pushed below is popped at the exits of
        	C mpn_pi1_bdiv_q_1.
        	C
        	C NOTE(review): with d = 0 the bsf at L(evn) leaves rcx undefined, so
        	C d is presumably required to be nonzero -- confirm against callers.
    61  	FUNC_ENTRY(4)
    62  	push	%rbx
    63  
    64  	mov	%rcx, %rax		C rax = d
    65  	xor	R32(%rcx), R32(%rcx)	C ncnt count, stays 0 for odd d
    66  	mov	%rdx, %r10		C r10 = n, as L(com) expects
    67  
    68  	bt	$0, R32(%rax)		C CF = low bit of d
    69  	jnc	L(evn)			C skip bsfq unless divisor is even
    70  
    71  L(odd):	mov	%rax, %rbx		C rbx = odd d, kept for the Newton steps
    72  	shr	R32(%rax)
    73  	and	$127, R32(%rax)		C d/2, 7 bits
    74  
    75  	LEA(	binvert_limb_table, %rdx)
    76  
    77  	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
    78  
    79  	mov	%rbx, %r11		C d without twos
    80  
        	C Each step below doubles the number of correct low bits of inv; the
        	C first two run in 32-bit registers, the last one in 64-bit registers.
    81  	lea	(%rax,%rax), R32(%rdx)	C 2*inv
    82  	imul	R32(%rax), R32(%rax)	C inv*inv
    83  	imul	R32(%rbx), R32(%rax)	C inv*inv*d
    84  	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
    85  
    86  	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
    87  	imul	R32(%rdx), R32(%rdx)	C inv*inv
    88  	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
    89  	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
    90  
    91  	lea	(%rax,%rax), %r8	C 2*inv
    92  	imul	%rax, %rax		C inv*inv
    93  	imul	%rbx, %rax		C inv*inv*d
    94  	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits
    95  
    96  	jmp	L(com)			C continue in mpn_pi1_bdiv_q_1 with r8 = inverse
    97  
    98  L(evn):	bsf	%rax, %rcx		C rcx = index of lowest set bit of d
    99  	shr	R8(%rcx), %rax		C shift out the low zero bits, d is now odd
   100  	jmp	L(odd)
   101  EPILOGUE()
   102  
   103  PROLOGUE(mpn_pi1_bdiv_q_1)
        	C mpn_pi1_bdiv_q_1 (rp, up, n, d, di, ncnt)
        	C
        	C For each of the n limbs, low to high, this takes the source shifted
        	C right by ncnt bits (shrd pulls in the low bits of the next limb),
        	C subtracts the carry left by the previous step, multiplies by di keeping
        	C the low 64 bits, and stores the product to rp as quotient limb q.
        	C The carry into the next step is the high limb of q*d plus a borrow bit.
        	C
        	C mpn_bdiv_q_1 also enters at L(com) after computing di and ncnt itself;
        	C both entry points push rbx before reaching that label.
        	C
        	C NOTE(review): n = 0 is not handled (dec wraps past zero, the jz is not
        	C taken and the loop counter starts out positive), so n >= 1 is
        	C presumably a caller obligation -- confirm.
   104  	FUNC_ENTRY(4)
        	C The two IFDOS lines load di and ncnt from the stack into r8 and r9.
        	C IFDOS is defined outside this file; presumably it emits its argument
        	C only for the DOS64 ABI -- confirm in config.m4.
   105  IFDOS(`	mov	56(%rsp), %r8	')
   106  IFDOS(`	mov	64(%rsp), %r9	')
   107  	push	%rbx
   108  
   109  	mov	%rcx, %r11		C d
   110  	mov	%rdx, %r10		C n
   111  	mov	%r9, %rcx		C ncnt, moved so the shifts can take it as a count
   112  
   113  L(com):	mov	(up), %rax		C up[0]
   114  
   115  	dec	%r10			C r10 = n-1
   116  	jz	L(one)			C single limb, no loop needed
   117  
   118  	mov	8(up), %rdx		C up[1]
   119  	lea	(up,%r10,8), up		C up end
   120  	lea	(rp,%r10,8), rp		C rp end
   121  	neg	%r10			C -n
   122  
   123  	shrd	R8(%rcx), %rdx, %rax	C rax = up[0] >> ncnt, top bits from up[1]
   124  
   125  	xor	R32(%rbx), R32(%rbx)	C no carry bit yet
   126  	jmp	L(ent)			C first limb has no carry to subtract
   127  
   128  	ALIGN(8)
   129  L(top):
   130  	C rax	q
   131  	C rbx	carry bit, 0 or 1
   132  	C rcx	ncnt
   133  	C rdx	carry limb once the mul below has run
   134  	C r10	counter, limbs, negative
        	C r8	di
        	C r9	scratch, the limb above the current one
        	C r11	d
   135  
   136  	mul	%r11			C carry limb in rdx
   137  	mov	(up,%r10,8), %rax	C current source limb
   138  	mov	8(up,%r10,8), %r9	C limb above it
   139  	shrd	R8(%rcx), %r9, %rax	C current limb >> ncnt, top bits from r9
   140  	nop				C NOTE(review): presumably padding for alignment or scheduling
   141  	sub	%rbx, %rax		C apply carry bit
   142  	setc	R8(%rbx)		C rbx = borrow out of that subtract
   143  	sub	%rdx, %rax		C apply carry limb
   144  	adc	$0, %rbx		C add the borrow out of the carry limb subtract
   145  L(ent):	imul	%r8, %rax		C q = low 64 bits of rax * di
   146  	mov	%rax, (rp,%r10,8)	C store quotient limb
   147  	inc	%r10
   148  	jnz	L(top)			C r10 = 0 means only the top limb is left
   149  
        	C Top limb: there is no higher limb to shift in, so a plain shr is used.
   150  	mul	%r11			C carry limb in rdx
   151  	mov	(up), %rax		C up high limb
   152  	shr	R8(%rcx), %rax
   153  	sub	%rbx, %rax		C apply carry bit
   154  	sub	%rdx, %rax		C apply carry limb
   155  	imul	%r8, %rax
   156  	mov	%rax, (rp)		C store the top quotient limb
   157  	pop	%rbx
   158  	FUNC_EXIT()
   159  	ret
   160  
   161  L(one):	shr	R8(%rcx), %rax		C n = 1: q = (up[0] >> ncnt) * di
   162  	imul	%r8, %rax
   163  	mov	%rax, (rp)
   164  	pop	%rbx
   165  	FUNC_EXIT()
   166  	ret
   167  EPILOGUE()