github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/div_qr_2u_pi1.asm (about)

     1  dnl  x86-64 mpn_div_qr_2u_pi1
     2  dnl  -- Divide an mpn number by an unnormalized 2-limb number,
     3  dnl     using a single-limb inverse and shifting the dividend on the fly.
     4  
     5  dnl  Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C		c/l
    37  C INPUT PARAMETERS
        C The first six are the register arguments; shift_param and di_param are
        C read from the stack relative to the current value of FRAME.
    38  define(`qp',		`%rdi') dnl base of the quotient limbs stored by the main loop
    39  define(`rp',		`%rsi') dnl receives the final two limbs u1 and u2; pushed on entry, popped at the end
    40  define(`up_param',	`%rdx') dnl dividend pointer; copied to up because mul overwrites %rdx
    41  define(`un_param',	`%rcx') dnl %rcx needed for shift count
    42  define(`d1',		`%r8') dnl high divisor limb, paired with u2 in the sub/sbb chains
    43  define(`d0',		`%r9') dnl low divisor limb, paired with u1 in the sub/sbb chains
    44  define(`shift_param',	`FRAME+8(%rsp)') dnl first stack argument; only its low 32 bits are loaded
    45  define(`di_param',	`FRAME+16(%rsp)') dnl second stack argument; the multiplier used in each division step
    46  
        C WORKING REGISTERS
    47  define(`di',		`%r10') dnl copy of di_param, kept in a register for the whole routine
    48  define(`up',		`%r11') dnl copy of up_param
    49  define(`un',		`%rbp') dnl limb index; starts at un_param - 2 and counts down
    50  define(`u2',		`%rbx') dnl high limb of the running two-limb value
    51  define(`u1',		`%r12') dnl low limb of the running two-limb value
    52  define(`u0',		`%rsi') dnl Same as rp, which is saved and restored.
    53  define(`t1',		`%r13') dnl quotient limb being formed
    54  define(`t0',		`%r14') dnl low quotient word q0, later compared against u2
    55  define(`md1',		`%r15') dnl holds -d1
    56  
    57  ASM_START()
    58  	TEXT
    59  	ALIGN(16)
    60  deflit(`FRAME', 0)
        C mpn_div_qr_2u_pi1 (qp, rp, up_param, un_param, d1, d0, shift_param, di_param)
        C
        C Divides the limbs at up by the two-limb divisor d1:d0, shifting the
        C dividend left by the shift count in %cl as the limbs are loaded.
        C Every step is the udiv_qr_3by2 sequence named in the comments below,
        C carried out on the running value u2:u1 and one new limb.
        C
        C Results:
        C   qp[un_param-3 .. 0]  quotient limbs, one stored per loop iteration
        C   %rax                 high quotient limb, kept on the stack until the end
        C   rp[0], rp[1]         final u1 and u2 after shifting u2:u1 right by %cl
        C
        C The entry code reads up[un_param-1] and up[un_param-2], so un_param >= 2
        C is assumed.
    61  PROLOGUE(mpn_div_qr_2u_pi1)
    62  	mov	di_param, di	C FRAME is 0 here, so this reads 16(%rsp)
    63  	mov	up_param, up	C up_param is %rdx, which mul overwrites
        	C Save the registers used as work registers, then rp, because %rsi is
        	C reused as u0.
    64  	push	%r15
    65  	push	%r14
    66  	push	%r13
    67  	push	%r12
    68  	push	%rbx
    69  	push	%rbp
    70  	push	rp
        C Seven 8-byte pushes: stack arguments are now 56 bytes further from %rsp.
    71  deflit(`FRAME', 56)
    72  	lea	-2(un_param), un	C un = un_param - 2
    73  	mov	d1, md1
    74  	neg	md1		C md1 = -d1
    75  
    76  	C int parameter, 32 bits only
    77  	movl	shift_param, R32(%rcx)	C shift count is in %cl from here on
    78  
    79  	C FIXME: Different code for SHLD_SLOW
    80  
        	C First step, producing the high quotient limb qh.
    81  	xor	R32(u2), R32(u2)
    82  	mov	8(up, un, 8), u1	C u1 = up[un_param - 1]
    83  	shld	%cl, u1, u2	C u2 = the %cl bits shifted out of the top of u1; u1 unchanged
    84  	C Remains to read (up, un, 8) and shift u1, u0
    85  	C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di)
    86  	mov	di, %rax
    87  	mul	u2		C %rdx:%rax = di * u2
    88  	mov	(up, un, 8), u0	C u0 = up[un_param - 2]
    89  	shld	%cl, u0, u1	C u1 = u1 << %cl, low bits filled from the top of u0
    90  	mov	u1, t0
    91  	add	%rax, t0	C q0 in t0
    92  	adc	u2, %rdx	C high product word + u2 + carry
    93  	mov	%rdx, t1	C q in t1
    94  	imul	md1, %rdx	C %rdx = -d1 * q, low 64 bits
    95  	mov	d0, %rax
    96  	lea	(%rdx, u1), u2	C u2 = u1 - d1 * q mod 2^64
    97  	mul	t1		C %rdx:%rax = d0 * q
    98  	mov	u0, u1
    99  	shl	%cl, u1		C u1 = u0 << %cl
   100  	sub	d0, u1
   101  	sbb	d1, u2		C u2:u1 -= d1:d0
   102  	sub	%rax, u1
   103  	sbb	%rdx, u2	C u2:u1 -= d0 * q
        	C Zero both registers before the compare so that the flags set by cmp
        	C are still intact for cmovnc and adc.
   104  	xor	R32(%rax), R32(%rax)
   105  	xor	R32(%rdx), R32(%rdx)
   106  	cmp	t0, u2		C carry set iff u2 < t0
   107  	cmovnc	d0, %rax
   108  	cmovnc	d1, %rdx	C no carry: %rdx:%rax = d1:d0, otherwise it stays 0
   109  	adc	$0, t1		C t1 += carry
   110  	nop			C NOTE(review): purpose not stated here, presumably padding; confirm
   111  	add	%rax, u1
   112  	adc	%rdx, u2	C u2:u1 += d1:d0 or 0, as selected above
   113  	cmp	d1, u2
   114  	jae	L(fix_qh)	C u2 >= d1: rare adjustment, returns to bck_qh
   115  L(bck_qh):
   116  	push	t1	C push qh on stack
   117  
   118  	jmp	L(next)		C enter the loop at the index decrement
   119  
   120  	ALIGN(16)
   121  L(loop):
   122  	C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
   123  	C Based on the optimized divrem_2.asm code.
   124  
   125  	mov	di, %rax
   126  	mul	u2		C %rdx:%rax = di * u2
   127  	mov	(up, un, 8), u0	C u0 = up[un]
   128  	xor	R32(t1), R32(t1)
   129  	shld	%cl, u0, t1	C t1 = the top %cl bits of u0
   130  	or	t1, u1		C merge into u1. NOTE(review): assumes the low %cl bits of u1 are zero here; confirm
   131  	mov	u1, t0
   132  	add	%rax, t0	C q0 in t0
   133  	adc	u2, %rdx	C high product word + u2 + carry
   134  	mov	%rdx, t1	C q in t1
   135  	imul	md1, %rdx	C %rdx = -d1 * q, low 64 bits
   136  	mov	d0, %rax
   137  	lea	(%rdx, u1), u2	C u2 = u1 - d1 * q mod 2^64
   138  	mul	t1		C %rdx:%rax = d0 * q
   139  	mov	u0, u1
   140  	shl	%cl, u1		C u1 = u0 << %cl
   141  	sub	d0, u1
   142  	sbb	d1, u2		C u2:u1 -= d1:d0
   143  	sub	%rax, u1
   144  	sbb	%rdx, u2	C u2:u1 -= d0 * q
        	C Same select-and-add-back sequence as in the first step.
   145  	xor	R32(%rax), R32(%rax)
   146  	xor	R32(%rdx), R32(%rdx)
   147  	cmp	t0, u2		C carry set iff u2 < t0
   148  	cmovnc	d0, %rax
   149  	cmovnc	d1, %rdx	C no carry: %rdx:%rax = d1:d0, otherwise it stays 0
   150  	adc	$0, t1		C t1 += carry
   151  	nop			C NOTE(review): purpose not stated here, presumably padding; confirm
   152  	add	%rax, u1
   153  	adc	%rdx, u2	C u2:u1 += d1:d0 or 0, as selected above
   154  	cmp	d1, u2
   155  	jae	L(fix)		C u2 >= d1: rare adjustment, returns to bck
   156  L(bck):
   157  	mov	t1, (qp, un, 8)	C qp[un] = quotient limb of this step
   158  L(next):
   159  	sub	$1, un
   160  	jnc	L(loop)		C continue until the decrement borrows, i.e. un was 0
   161  L(end):
   162  	C qh on stack
   163  	pop	%rax		C qh is in %rax when ret executes
   164  	pop	rp		C restore rp; %rsi was in use as u0
   165  	shrd	%cl, u2, u1
   166  	shr	%cl, u2		C u2:u1 >>= %cl
   167  	mov	u2, 8(rp)
   168  	mov	u1, (rp)
   169  
        	C Restore the saved registers in reverse push order.
   170  	pop	%rbp
   171  	pop	%rbx
   172  	pop	%r12
   173  	pop	%r13
   174  	pop	%r14
   175  	pop	%r15
   176  	ret
   177  
   178  L(fix):	C Unlikely update. u2 >= d1
   179  	seta	%dl		C %dl = u2 > d1; flags are still those of cmp d1, u2
   180  	cmp	d0, u1
   181  	setae	%al		C %al = u1 >= d0
   182  	orb	%dl, %al		C "orb" form to placate Sun tools
   183  	je	L(bck)		C neither condition holds: no adjustment needed
   184  	inc	t1		C quotient limb += 1
   185  	sub	d0, u1
   186  	sbb	d1, u2		C u2:u1 -= d1:d0
   187  	jmp	L(bck)
   188  
   189  C Duplicated, just jumping back to a different address.
   190  L(fix_qh):	C Unlikely update. u2 >= d1
   191  	seta	%dl		C %dl = u2 > d1; flags are still those of cmp d1, u2
   192  	cmp	d0, u1
   193  	setae	%al		C %al = u1 >= d0
   194  	orb	%dl, %al		C "orb" form to placate Sun tools
   195  	je	L(bck_qh)	C neither condition holds: no adjustment needed
   196  	inc	t1		C qh += 1
   197  	sub	d0, u1
   198  	sbb	d1, u2		C u2:u1 -= d1:d0
   199  	jmp	L(bck_qh)
   200  EPILOGUE()