github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/div_qr_1n_pi1.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/div_qr_1n_pi1.asm (about)

     1  dnl  x86-64 mpn_div_qr_1n_pi1
     2  dnl  -- Divide an mpn number by a normalized single-limb number,
     3  dnl     using a single-limb inverse.
     4  
     5  dnl  Contributed to the GNU project by Niels Möller
     6  
     7  dnl  Copyright 2013 Free Software Foundation, Inc.
     8  
     9  dnl  This file is part of the GNU MP Library.
    10  dnl
    11  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    12  dnl  it under the terms of either:
    13  dnl
    14  dnl    * the GNU Lesser General Public License as published by the Free
    15  dnl      Software Foundation; either version 3 of the License, or (at your
    16  dnl      option) any later version.
    17  dnl
    18  dnl  or
    19  dnl
    20  dnl    * the GNU General Public License as published by the Free Software
    21  dnl      Foundation; either version 2 of the License, or (at your option) any
    22  dnl      later version.
    23  dnl
    24  dnl  or both in parallel, as here.
    25  dnl
    26  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    27  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    28  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    29  dnl  for more details.
    30  dnl
    31  dnl  You should have received copies of the GNU General Public License and the
    32  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    33  dnl  see https://www.gnu.org/licenses/.
    34  
    35  include(`../config.m4')
    36  
    37  
    38  C		c/l
    39  C AMD K8,K9	11
    40  C AMD K10	11
    41  C AMD bull	16
    42  C AMD pile	14.25
    43  C AMD steam	 ?
    44  C AMD bobcat	16
    45  C AMD jaguar	 ?
    46  C Intel P4	47.5	poor
    47  C Intel core	28.5	very poor
    48  C Intel NHM	29	very poor
    49  C Intel SBR	16	poor
    50  C Intel IBR	13.5
    51  C Intel HWL	12
    52  C Intel BWL	 ?
    53  C Intel atom	53	very poor
    54  C VIA nano	19
    55  
    56  
    57  C INPUT Parameters
        C Register names for mpn_div_qr_1n_pi1.  B denotes the limb base 2^64 in
        C the comments of this file.
    58  define(`QP', `%rdi')	C quotient limbs are stored through this pointer
    59  define(`UP', `%rsi')	C source limbs are loaded through this pointer
    60  define(`UN_INPUT', `%rdx')	C limb count n; moved to UN before mul overwrites %rdx
    61  define(`U1', `%rcx')	C Also in %rax (the loop keeps a copy there as the mul operand)
    62  define(`D', `%r8')	C divisor; saved on the stack while %r8 serves as UN
    63  define(`DINV', `%r9')	C single-limb inverse of D, used as a multiplier
    64  
    65  C Invariants
        C Both are computed once at L(first) and only read afterwards.
    66  define(`B2', `%rbp')	C -(D * DINV) mod B
    67  define(`B2md', `%rbx')	C B2 - D mod B
    68  
    69  C Variables
        C T and U0 are the only ones used on the n == 1 path.  U2, Q0, Q1, Q2 and
        C the two invariants live in registers that the multi-limb path pushes on
        C entry and pops before returning.
    70  define(`UN', `%r8')	C Overlaps D input
    71  define(`T', `%r10')	C temporary
    72  define(`U0', `%r11')	C low limb of the running partial remainder
    73  define(`U2', `%r12')	C carry mask produced by sbb U2, U2: zero or all ones
    74  define(`Q0', `%r13')	C quotient accumulator, low limb
    75  define(`Q1', `%r14')	C quotient accumulator, middle limb
    76  define(`Q2', `%r15')	C quotient accumulator, high limb
    77  
    78  ABI_SUPPORT(STD64)
    79  
    80  	ASM_START()
    81  	TEXT
    82  	ALIGN(16)
    83  PROLOGUE(mpn_div_qr_1n_pi1)
        	C Divide the number U1 * B^n + {UP, n} by the single limb D, using
        	C the precomputed inverse DINV.  B = 2^64 and n = UN_INPUT.
        	C
        	C In:   QP = quotient destination, UP = source limbs, UN_INPUT = n,
        	C       U1 = limb above UP[n-1], D = divisor, DINV = inverse of D.
        	C       n == 0 is not handled: the dec below would wrap around and
        	C       the multi-limb path would be taken.
        	C Out:  QP[0..n-1] receive the quotient limbs; %rax holds the
        	C       remainder at both ret instructions.
        	C Regs: the n == 1 path uses only T, U0, %rax, %rdx and the argument
        	C       registers.  The multi-limb path pushes %r15, %r14, %r13,
        	C       %r12, %rbx, %rbp and pops them again at L(done).
        	C
        	C NOTE(review): the file header requires D to be normalized; any
        	C further preconditions (such as U1 < D) are not visible in this
        	C file -- confirm against the C reference implementation.
    84  	FUNC_ENTRY(6)
        	C NOTE(review): FUNC_ENTRY, FUNC_EXIT and IFDOS are defined outside
        	C this file.  Presumably IFDOS emits its argument only for the DOS64
        	C ABI, where the two loads fetch D and DINV from the stack, and the
        	C offsets 56 and 64 account for what FUNC_ENTRY pushed -- confirm.
    85  IFDOS(`	mov	56(%rsp), %r8	')
    86  IFDOS(`	mov	64(%rsp), %r9	')
    87  	dec	UN_INPUT		C UN_INPUT = n - 1
    88  	jnz	L(first)		C n != 1: take the multi-limb path
    89  
    90  	C Just a single 2/1 division.
    91  	C T, U0 are allocated in scratch registers
        	C Divides U1 * B + UP[0] by D.  The quotient limb ends up in T and
        	C the remainder in %rax.
    92  	lea	1(U1), T		C T = U1 + 1
    93  	mov	U1, %rax
    94  	mul	DINV			C %rdx:%rax = U1 * DINV
    95  	mov	(UP), U0		C U0 = UP[0]
    96  	add	U0, %rax
    97  	adc	T, %rdx			C %rdx:%rax += (U1 + 1) * B + U0
    98  	mov	%rdx, T			C T = candidate quotient q (high limb of the sum)
    99  	imul	D, %rdx			C %rdx = low limb of q * D
   100  	sub	%rdx, U0		C U0 = candidate remainder r = U0 - q * D mod B
   101  	cmp	U0, %rax		C CF = 1 iff r > low limb of the sum
   102  	lea	(U0, D), %rax		C %rax = r + D; lea leaves CF untouched
   103  	cmovnc	U0, %rax		C CF = 0: keep r instead of r + D
   104  	sbb	$0, T			C CF = 1: q = q - 1, matching the r + D choice
   105  	cmp	D, %rax
   106  	jc	L(single_div_done)	C %rax < D: nothing more to correct
   107  	sub	D, %rax			C otherwise subtract D once more ...
   108  	add	$1, T			C ... and count it in the quotient
   109  L(single_div_done):
   110  	mov	T, (QP)			C QP[0] = quotient limb
   111  	FUNC_EXIT
   112  	ret
   113  L(first):
   114  	C FIXME: Could delay some of these until we enter the loop.
        	C Save the registers that hold Q2, Q1, Q0, U2, B2md and B2.
   115  	push	%r15
   116  	push	%r14
   117  	push	%r13
   118  	push	%r12
   119  	push	%rbx
   120  	push	%rbp
   121  
        	C Loop invariants: B2 = -(D * DINV) mod B, B2md = B2 - D mod B.
        	C NOTE(review): the name suggests that B2 equals B^2 mod D; that
        	C depends on how the caller defines DINV, which is not shown here.
   122  	mov	D, B2
   123  	imul	DINV, B2
   124  	neg	B2
   125  	mov	B2, B2md
   126  	sub	D, B2md
   127  
   128  	C D not needed until final reduction
   129  	push	D
   130  	mov	UN_INPUT, UN	C Clobbers D
   131  
        	C First quotient step: Q0 = low limb of DINV * U1; the high limb
        	C plus U1 goes to T and is stored below as the initial QP[n-1].
   132  	mov	DINV, %rax
   133  	mul	U1
   134  	mov	%rax, Q0
   135  	add	U1, %rdx
   136  	mov	%rdx, T
   137  
        	C First remainder step: add B2 * U1 to the two top source limbs.
        	C The result is {U2, U1, U0} with U2 kept as a 0 / all-ones mask.
   138  	mov	B2, %rax
   139  	mul	U1			C %rdx:%rax = B2 * U1
   140  	mov	-8(UP, UN, 8), U0	C U0 = UP[n-2]
   141  	mov	(UP, UN, 8), U1		C U1 = UP[n-1]
   142  	mov	T, (QP, UN, 8)		C QP[n-1] = T
   143  	add	%rax, U0
   144  	adc	%rdx, U1
   145  	sbb	U2, U2			C U2 = -CF (0 or all ones); CF itself is unchanged
   146  	dec	UN			C UN = n - 2; dec does not modify CF
   147  	mov	U1, %rax		C L(loop) and L(final) expect U1 in %rax
   148  	jz	L(final)		C n == 2: no loop iterations
        	C Clear Q1 with mov, not xor, because CF must survive into the
        	C loop.  R32 is defined outside this file; presumably it names the
        	C 32-bit view of Q1, whose write zero-extends into the full register.
   149  	mov	$0, R32(Q1)
   150  
   151  	ALIGN(16)
   152  
   153  	C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
   154  	C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
   155  	C is zero, and carry holds an extra copy of U2.
        	C Each iteration reads UP[UN-1], adds into QP[UN+1] and stores
        	C QP[UN], for UN counting down from n - 2 to 1.
   156  L(loop):
   157  	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
   158  	C Remains to add in B (U1 + c)
   159  	cmovc	DINV, Q1		C Q1 = DINV if U2 is set (CF mirrors U2), else stays 0
   160  	mov	U2, Q2
   161  	neg	Q2			C Q2 = 1 if U2 is set, else 0
   162  	mul	DINV			C %rdx:%rax = U1 * DINV (%rax held the copy of U1)
   163  	add	%rdx, Q1
   164  	adc	$0, Q2
   165  	add	Q0, Q1			C carry is consumed by the adc four lines down
   166  	mov	%rax, Q0		C new Q0 = low limb of U1 * DINV
   167  	mov	B2, %rax		C multiplier for the mul below
   168  	lea	(B2md, U0), T		C T = U0 + B2 - D mod B; lea leaves CF untouched
   169  	adc	$0, Q2			C carry from add Q0, Q1 (mov and lea keep flags)
   170  
   171  	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
        	C In the code below, the term subtracted on carry c is D: T was
        	C precomputed as U0 + B2 - D and is kept only when the add carries.
   172  	mul	U1			C %rdx:%rax = B2 * U1
   173  	and	B2, U2			C U2 = B2 if the mask was set, else 0
   174  	add	U2, U0			C CF = carry c out of U0 + U2
   175  	cmovnc	U0, T			C c = 0: T = U0 + U2; c = 1: T stays U0 + B2 - D
   176  
   177  	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
   178  	adc	U1, Q1			C Q1 += U1 + c (cmovnc did not change CF)
   179  	mov	-8(UP, UN, 8), U0	C U0 = UP[UN-1], the next source limb; CF kept
   180  	adc	Q2, 8(QP, UN, 8)	C QP[UN+1] += Q2 + carry
   181  	jc	L(q_incr)		C carry out: propagate into QP[UN+2] and above
   182  L(q_incr_done):
   183  	add	%rax, U0		C U0 += low limb of B2 * U1
   184  	mov	T, %rax
   185  	adc	%rdx, %rax		C %rax = T + high limb of B2 * U1 + carry: next U1
   186  	mov	Q1, (QP, UN, 8)		C QP[UN] = Q1
   187  	mov	$0, R32(Q1)		C clear Q1 for the next pass; mov keeps CF
   188  	sbb	U2, U2			C U2 = -CF (0 or all ones); CF itself is unchanged
   189  	dec	UN			C dec does not modify CF
   190  	mov	%rax, U1
   191  	jnz	L(loop)
   192  
        	C Here UN is zero, %rax holds U1, U0 holds the low limb, U2 is the
        	C carry mask and Q0 is the pending low quotient limb.
   193  L(final):
   194  	pop	D			C restore the divisor; UN (same register) is not used again
   195  
        	C Conditionally subtract D from %rax up to twice: once under the
        	C U2 mask and once if the result is still >= D.  Q1 counts the
        	C subtractions actually performed.
   196  	mov	U2, Q1
   197  	and	D, U2			C U2 = D if the mask was set, else 0
   198  	sub	U2, %rax
   199  	neg	Q1			C Q1 = 1 if the mask was set, else 0
   200  
   201  	mov	%rax, U1		C keep a copy in case the subtraction borrows
   202  	sub	D, %rax
   203  	cmovc	U1, %rax		C borrow: undo the subtraction
   204  	sbb	$-1, Q1			C Q1 += 1 - CF: count D only when there was no borrow
   205  
        	C Final 2/1 division of %rax * B + U0 by D, with the same
        	C instruction sequence as the n == 1 path.  Quotient limb in T,
        	C remainder in %rax.
   206  	lea	1(%rax), T		C T = high limb + 1
   207  	mul	DINV			C %rdx:%rax = high limb * DINV
   208  	add	U0, %rax
   209  	adc	T, %rdx			C %rdx:%rax += (high limb + 1) * B + U0
   210  	mov	%rdx, T			C T = candidate quotient q
   211  	imul	D, %rdx			C %rdx = low limb of q * D
   212  	sub	%rdx, U0		C U0 = candidate remainder r
   213  	cmp	U0, %rax		C CF = 1 iff r > low limb of the sum
   214  	lea	(U0, D), %rax		C %rax = r + D; lea leaves CF untouched
   215  	cmovnc	U0, %rax		C CF = 0: keep r instead of r + D
   216  	sbb	$0, T			C CF = 1: q = q - 1
   217  	cmp	D, %rax
   218  	jc	L(div_done)		C %rax < D: nothing more to correct
   219  	sub	D, %rax
   220  	add	$1, T
   221  L(div_done):
   222  	add	T, Q0			C lowest quotient limb = Q0 + T
   223  	mov	Q0, (QP)		C QP[0]; mov keeps the carry for the adc
   224  	adc	Q1, 8(QP)		C QP[1] += Q1 + carry
   225  	jnc	L(done)
        	C Carry out of QP[1]: add 1 to QP[2], QP[3], ... until no carry.
   226  L(final_q_incr):
   227  	addq	$1, 16(QP)
   228  	lea	8(QP), QP		C step to the next limb; lea keeps CF from the addq
   229  	jc	L(final_q_incr)
   230  
        	C Restore the saved registers in reverse push order.  %rax still
        	C holds the remainder.
   231  L(done):
   232  	pop	%rbp
   233  	pop	%rbx
   234  	pop	%r12
   235  	pop	%r13
   236  	pop	%r14
   237  	pop	%r15
   238  	FUNC_EXIT
   239  	ret
   240  
        	C Out-of-line carry propagation for the loop: add 1 to QP[UN+2]
        	C and, while that carries, to each following limb, then resume at
        	C L(q_incr_done).
   241  L(q_incr):
   242  	C U1 is not live, so use it for indexing
   243  	lea	16(QP, UN, 8), U1	C U1 = address of QP[UN+2]
   244  L(q_incr_loop):
   245  	addq	$1, (U1)
   246  	jnc	L(q_incr_done)		C no carry: back to the main loop
   247  	lea	8(U1), U1		C advance to the next quotient limb
   248  	jmp	L(q_incr_loop)
   249  EPILOGUE()