github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/div_qr_1n_pi1.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/div_qr_1n_pi1.asm (about)

     1  dnl  x86-64 mpn_div_qr_1n_pi1
     2  dnl  -- Divide an mpn number by a normalized single-limb number,
     3  dnl     using a single-limb inverse.
     4  
     5  dnl  Contributed to the GNU project by Niels Möller
     6  
     7  dnl  Copyright 2013 Free Software Foundation, Inc.
     8  
     9  dnl  This file is part of the GNU MP Library.
    10  dnl
    11  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    12  dnl  it under the terms of either:
    13  dnl
    14  dnl    * the GNU Lesser General Public License as published by the Free
    15  dnl      Software Foundation; either version 3 of the License, or (at your
    16  dnl      option) any later version.
    17  dnl
    18  dnl  or
    19  dnl
    20  dnl    * the GNU General Public License as published by the Free Software
    21  dnl      Foundation; either version 2 of the License, or (at your option) any
    22  dnl      later version.
    23  dnl
    24  dnl  or both in parallel, as here.
    25  dnl
    26  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    27  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    28  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    29  dnl  for more details.
    30  dnl
    31  dnl  You should have received copies of the GNU General Public License and the
    32  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    33  dnl  see https://www.gnu.org/licenses/.
    34  
    35  include(`../config.m4')
    36  
    37  
    38  C		c/l
    39  C AMD K8,K9	13
    40  C AMD K10	13
    41  C AMD bull	16.5
    42  C AMD pile	15
    43  C AMD steam	 ?
    44  C AMD bobcat	16
    45  C AMD jaguar	 ?
    46  C Intel P4	47	poor
    47  C Intel core	19.25
    48  C Intel NHM	18
    49  C Intel SBR	15	poor
    50  C Intel IBR	13
    51  C Intel HWL	11.7
    52  C Intel BWL	 ?
    53  C Intel atom	52	very poor
    54  C VIA nano	19
    55  
    56  
    57  C INPUT Parameters
    58  define(`QP', `%rdi')	C Base address the quotient limbs are stored through
    59  define(`UP', `%rsi')	C Base address the dividend limbs are loaded from
    60  define(`UN_INPUT', `%rdx')	C Limb count; decremented at entry, later copied to UN
    61  define(`U1', `%rcx')	C Also in %rax
    62  define(`D', `%r8')	C Divisor limb; kept on the stack while %r8 serves as UN
    63  define(`DINV', `%r9')	C Single-limb inverse, the multiplier for quotient estimates
    64  
    65  C Invariants
    66  define(`B2', `%rbp')	C Set once to -(D * DINV) mod 2^64
    67  define(`B2md', `%rbx')	C Set once to B2 - D
    68  
    69  C Variables
    70  define(`UN', `%r8')	C Overlaps D input
    71  define(`T', `%r10')	C Scratch: quotient candidate or adjusted remainder limb
    72  define(`U0', `%r11')	C Low limb of the running remainder
    73  define(`U2', `%r12')	C Carry mask produced by sbb: zero or all ones
    74  define(`Q0', `%r13')	C Low limb of the pending quotient contribution
    75  define(`Q1', `%r14')	C Middle limb of the pending quotient contribution
    76  define(`Q2', `%r15')	C Top limb (carries) of the pending quotient contribution
    77  
    78  ABI_SUPPORT(STD64)
    79  
    80  	ASM_START()
    81  	TEXT
    82  	ALIGN(16)
    83  PROLOGUE(mpn_div_qr_1n_pi1)
        	C Divide the number made of U1 (high limb) followed by the
        	C UN_INPUT limbs at UP by the single limb D, using DINV as a
        	C precomputed inverse instead of a hardware divide.
        	C
        	C In:   QP (%rdi)        quotient limbs are stored here
        	C       UP (%rsi)        dividend limbs are loaded from here
        	C       UN_INPUT (%rdx)  number of limbs at UP
        	C       U1 (%rcx)        high limb on entry
        	C       D (%r8)          divisor limb
        	C       DINV (%r9)       inverse limb
        	C Out:  UN_INPUT limbs stored at QP; %rax holds the reduced
        	C       remainder at the ret (presumably the return value).
        	C Uses: %r10, %r11 as scratch; %rbx, %rbp, %r12-%r15 are pushed
        	C       and popped on the multi-limb path only.
        	C NOTE(review): nothing here validates the inputs. It looks like
        	C D must have its top bit set, U1 < D and UN_INPUT >= 1; confirm
        	C against the callers.
        	C FUNC_ENTRY, FUNC_EXIT and IFDOS are defined outside this file.
        	C The IFDOS lines load %r8 and %r9 from the stack, presumably
        	C the fifth and sixth arguments under the DOS64 ABI.
    84  	FUNC_ENTRY(6)
    85  IFDOS(`	mov	56(%rsp), %r8	')
    86  IFDOS(`	mov	64(%rsp), %r9	')
    87  	dec	UN_INPUT	C Now limb count - 1; zero means one limb at UP
    88  	jnz	L(first)
    89  
    90  	C Just a single 2/1 division.
    91  	C T, U0 are allocated in scratch registers
    92  	lea	1(U1), T	C T = U1 + 1
    93  	mov	U1, %rax
    94  	mul	DINV	C %rdx:%rax = U1 * DINV
    95  	mov	(UP), U0	C U0 = the only limb at UP
    96  	add	U0, %rax
    97  	adc	T, %rdx	C %rdx:%rax += {U1 + 1, U0}; %rdx is the quotient candidate
    98  	mov	%rdx, T
    99  	imul	D, %rdx
   100  	sub	%rdx, U0	C U0 = U0 - candidate * D, the remainder candidate
   101  	cmp	U0, %rax	C CF set when %rax < U0
   102  	lea	(U0, D), %rax	C %rax = U0 + D; lea leaves the flags alone
   103  	cmovnc	U0, %rax	C Keep plain U0 when CF is clear
   104  	sbb	$0, T	C Quotient candidate -= CF
   105  	cmp	D, %rax
   106  	jc	L(single_div_done)	C Remainder already below D
   107  	sub	D, %rax	C Otherwise one more correction: remainder -= D
   108  	add	$1, T	C and quotient += 1
   109  L(single_div_done):
   110  	mov	T, (QP)	C Store the single quotient limb
   111  	FUNC_EXIT
   112  	ret
   113  L(first):
   114  	C FIXME: Could delay some of these until we enter the loop.
        	C Save the six registers that the L(done) exit path pops again.
   115  	push	%r15
   116  	push	%r14
   117  	push	%r13
   118  	push	%r12
   119  	push	%rbx
   120  	push	%rbp
   121  
   122  	mov	D, B2
   123  	imul	DINV, B2
   124  	neg	B2	C B2 = -(D * DINV) mod 2^64
        	C NOTE(review): presumably this equals B^2 mod D for the usual
        	C definition of DINV; that definition is not visible here.
   125  	mov	B2, B2md
   126  	sub	D, B2md	C B2md = B2 - D
   127  
   128  	C D not needed until final reduction
   129  	push	D
   130  	mov	UN_INPUT, UN	C Clobbers D
   131  
   132  	mov	DINV, %rax
   133  	mul	U1
   134  	mov	%rax, Q0	C Q0 = low limb of DINV * U1
   135  	add	U1, %rdx
   136  	mov	%rdx, T	C T = high limb of DINV * U1, plus U1
   137  
   138  	mov	B2, %rax
   139  	mul	U1	C %rdx:%rax = B2 * U1
   140  	mov	-8(UP, UN, 8), U0	C U0 = UP[UN - 1]
   141  	mov	(UP, UN, 8), U1	C U1 = UP[UN], the top limb at UP
   142  	mov	T, (QP, UN, 8)	C First estimate of the top quotient limb
   143  	add	%rax, U0
   144  	adc	%rdx, U1	C {U1, U0} += B2 * old U1
   145  	sbb	U2, U2	C U2 = all ones if that addition carried out, else 0
   146  	dec	UN	C dec leaves CF alone and sets ZF for the jz below
   147  	mov	U1, %rax
   148  	jz	L(final)
   149  
   150  	ALIGN(16)
   151  
   152  	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
   153  	C At entry, %rax holds an extra copy of U1
   154  L(loop):
   155  	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
   156  	C Remains to add in B (U1 + c)
   157  	mov	DINV, Q1
   158  	mov	U2, Q2
   159  	and	U2, Q1	C Q1 = DINV when the U2 mask is set, else 0
   160  	neg	Q2	C Q2 = 1 when the U2 mask is set, else 0
   161  	mul	DINV	C %rdx:%rax = U1 * DINV (%rax held the copy of U1)
   162  	add	%rdx, Q1
   163  	adc	$0, Q2
   164  	add	Q0, Q1	C CF from here is consumed by the adc three lines down
   165  	mov	%rax, Q0	C New Q0 = low limb of U1 * DINV
   166  	mov	B2, %rax
   167  	lea	(B2md, U0), T	C T = U0 + B2 - D, used if U0 + B2 overflows below
   168  	adc	$0, Q2	C mov and lea above did not touch the flags
   169  
   170  	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
   171  	mul	U1	C %rdx:%rax = B2 * U1
   172  	and	B2, U2	C U2 = B2 when the mask is set, else 0
   173  	add	U2, U0
   174  	cmovnc	U0, T	C No carry: T = U0; carry: T stays U0 + B2 - D
   175  
   176  	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
   177  	adc	U1, Q1	C Carry-in is still the CF of the add into U0
   178  	mov	-8(UP, UN, 8), U0	C Fetch the next lower limb from UP
   179  	adc	Q2, 8(QP, UN, 8)	C Fold Q2 and carry into the limb stored last round
   180  	jc	L(q_incr)	C Rare: carry must ripple into higher quotient limbs
   181  L(q_incr_done):
   182  	add	%rax, U0
   183  	mov	T, %rax
   184  	adc	%rdx, %rax	C {%rax, U0} = {T, new limb} + B2 * U1
   185  	mov	Q1, (QP, UN, 8)	C Store this round's quotient limb estimate
   186  	sbb	U2, U2	C U2 = all ones if the adc carried out, else 0
   187  	dec	UN
   188  	mov	%rax, U1	C Reload U1, which L(q_incr) may have used as a pointer
   189  	jnz	L(loop)
   190  
   191  L(final):
   192  	pop	D	C %r8 is D again; UN is finished
   193  
   194  	mov	U2, Q1
   195  	and	D, U2
   196  	sub	U2, %rax	C Subtract D from the high limb if a carry was pending
   197  	neg	Q1	C Q1 = 1 for that pending carry, else 0
   198  
   199  	mov	%rax, U1
   200  	sub	D, %rax
   201  	cmovc	U1, %rax	C Undo the subtraction if it borrowed
   202  	sbb	$-1, Q1	C Q1 += 1 - CF, so add 1 when the subtraction stuck
   203  
        	C Same 2/1 division step as the single-limb path, applied to
        	C the pair {%rax, U0}; T ends up as the quotient limb.
   204  	lea	1(%rax), T
   205  	mul	DINV
   206  	add	U0, %rax
   207  	adc	T, %rdx
   208  	mov	%rdx, T
   209  	imul	D, %rdx
   210  	sub	%rdx, U0
   211  	cmp	U0, %rax
   212  	lea	(U0, D), %rax
   213  	cmovnc	U0, %rax
   214  	sbb	$0, T
   215  	cmp	D, %rax
   216  	jc	L(div_done)
   217  	sub	D, %rax
   218  	add	$1, T
   219  L(div_done):
   220  	add	T, Q0
   221  	mov	Q0, (QP)	C Lowest quotient limb
   222  	adc	Q1, 8(QP)	C Fold Q1 and carry into the next limb
   223  	jnc	L(done)
   224  L(final_q_incr):
   225  	addq	$1, 16(QP)	C Ripple the carry upwards one limb at a time
   226  	lea	8(QP), QP	C Advance without disturbing CF
   227  	jc	L(final_q_incr)
   228  
   229  L(done):
        	C Restore in the reverse of the push order at L(first).
   230  	pop	%rbp
   231  	pop	%rbx
   232  	pop	%r12
   233  	pop	%r13
   234  	pop	%r14
   235  	pop	%r15
   236  	FUNC_EXIT
   237  	ret
   238  
   239  L(q_incr):
   240  	C U1 is not live, so use it for indexing
   241  	lea	16(QP, UN, 8), U1	C Address of QP[UN + 2], one above the limb that carried
   242  L(q_incr_loop):
   243  	addq	$1, (U1)
   244  	jnc	L(q_incr_done)	C Carry absorbed; resume the main loop
   245  	lea	8(U1), U1	C Otherwise move up one limb and try again
   246  	jmp	L(q_incr_loop)
   247  EPILOGUE()