github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/cnd_aors_n.asm (about)

     1  dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
     2  
     3  dnl  Copyright 2011-2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C AMD K8,K9	 2
    35  C AMD K10	 2
    36  C AMD bd1	 2.32
    37  C AMD bobcat	 3
    38  C Intel P4	13
    39  C Intel core2	 2.9
    40  C Intel NHM	 2.8
    41  C Intel SBR	 2.4
    42  C Intel atom	 5.33
    43  C VIA nano	 3
    44  
    45  C NOTES
    46  C  * It might seem natural to use the cmov insn here, but since this function
    47  C    is supposed to have the exact same execution pattern for cnd true and
    48  C    false, and since cmov's documentation is not clear about whether it
    49  C    actually reads both source operands and writes the register for a false
    50  C    condition, we cannot use it.
    51  C  * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
    52  C    to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
    53  C    ADCSBB-to-memory, again saving 1 insn/limb.
    54  C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
    55  C    for any other processor.
    56  
    57  C INPUT PARAMETERS (register used by this code; the dnl note is presumably the DOS64 argument location)
    58  define(`cnd',	`%rdi')	dnl rcx; condition word, nonzero means vp takes part in the operation
    59  define(`rp',	`%rsi')	dnl rdx; destination limbs
    60  define(`up',	`%rdx')	dnl r8; first source limbs, always used
    61  define(`vp',	`%rcx')	dnl r9; second source limbs, ANDed with the cnd mask before use
    62  define(`n',	`%r8')	dnl rsp+40; limb count, indexes all three arrays in 8-byte steps
    63  C Operation selection: ADDSUB handles a limb with no carry-in, ADCSBB the carry-consuming limbs, func is the entry-point name.  The OPERATION_* symbol is presumably defined by the build, one per variant.
    64  ifdef(`OPERATION_cnd_add_n', `
    65  	define(ADDSUB,	      add)
    66  	define(ADCSBB,	      adc)
    67  	define(func,	      mpn_cnd_add_n)')
    68  ifdef(`OPERATION_cnd_sub_n', `
    69  	define(ADDSUB,	      sub)
    70  	define(ADCSBB,	      sbb)
    71  	define(func,	      mpn_cnd_sub_n)')
    72  C Names both entry points this one source can be built as.  NOTE(review): the macro is defined outside this file; confirm its exact role there.
    73  MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
    74  C ABIs this code is written for; the DOS64 differences show up at the IFDOS line and the entry/exit macros.  Macro defined outside this file.
    75  ABI_SUPPORT(DOS64)
    76  ABI_SUPPORT(STD64)
    77  C func(cnd, rp, up, vp, n): rp[i] = up[i] ADDSUB (vp[i] AND mask) over n limbs with carry propagation; mask is all ones when cnd is nonzero, else 0.  The final carry or borrow (0 or 1) is left in rax.
    78  ASM_START()
    79  	TEXT
    80  	ALIGN(16)
    81  PROLOGUE(func)
    82  	FUNC_ENTRY(4)
    83  IFDOS(`	mov	56(%rsp), R32(%r8)')
    84  	push	%rbx			C rbx, rbp, r12-r14 are callee-saved and serve as limb temporaries below
    85  	push	%rbp
    86  	push	%r12
    87  	push	%r13
    88  	push	%r14
    89  C Turn cnd into a mask without branching on it (see NOTES): neg sets CF iff cnd is nonzero, then sbb leaves 0 - CF.
    90  	neg	cnd
    91  	sbb	cnd, cnd		C make cnd mask: all ones if cnd was nonzero, else 0
    92  C Point vp, up and rp just past their last limb; with n negated below, (ptr,n,8) walks forward as n rises towards 0.
    93  	lea	(vp,n,8), vp
    94  	lea	(up,n,8), up
    95  	lea	(rp,n,8), rp
    96  C Dispatch on n mod 4: leftover limbs first, then 4 per loop pass.  NOTE(review): n = 0 also takes the jz and would process 4 limbs; presumably callers pass n >= 1.
    97  	mov	R32(n), R32(%rax)
    98  	neg	n			C n = -n: negative index that counts up to 0
    99  	and	$3, R32(%rax)		C eax = n mod 4
   100  	jz	L(top)			C carry-save reg rax = 0 in this arc
   101  	cmp	$2, R32(%rax)
   102  	jc	L(b1)			C residue 1
   103  	jz	L(b2)			C residue 2; residue 3 falls through
   104  C Residue 3: handle three limbs, then enter the loop if any limbs remain.
   105  L(b3):	mov	(vp,n,8), %r12
   106  	mov	8(vp,n,8), %r13
   107  	mov	16(vp,n,8), %r14
   108  	and	cnd, %r12		C mask each vp limb; up limbs are used as loaded
   109  	mov	(up,n,8), %r10
   110  	and	cnd, %r13
   111  	mov	8(up,n,8), %rbx
   112  	and	cnd, %r14
   113  	mov	16(up,n,8), %rbp
   114  	ADDSUB	%r12, %r10		C lowest limb: no carry-in
   115  	mov	%r10, (rp,n,8)
   116  	ADCSBB	%r13, %rbx
   117  	mov	%rbx, 8(rp,n,8)
   118  	ADCSBB	%r14, %rbp
   119  	mov	%rbp, 16(rp,n,8)
   120  	sbb	R32(%rax), R32(%rax)	C save carry: eax = 0 - CF, i.e. 0 or -1
   121  	add	$3, n
   122  	js	L(top)			C n still negative: limbs remain
   123  	jmp	L(end)
   124  C Residue 2: handle two limbs, same scheme as above.
   125  L(b2):	mov	(vp,n,8), %r12
   126  	mov	8(vp,n,8), %r13
   127  	mov	(up,n,8), %r10
   128  	and	cnd, %r12
   129  	mov	8(up,n,8), %rbx
   130  	and	cnd, %r13
   131  	ADDSUB	%r12, %r10		C lowest limb: no carry-in
   132  	mov	%r10, (rp,n,8)
   133  	ADCSBB	%r13, %rbx
   134  	mov	%rbx, 8(rp,n,8)
   135  	sbb	R32(%rax), R32(%rax)	C save carry
   136  	add	$2, n
   137  	js	L(top)			C n still negative: limbs remain
   138  	jmp	L(end)
   139  C Residue 1: handle one limb, then fall into the loop unless n has reached 0.
   140  L(b1):	mov	(vp,n,8), %r12
   141  	mov	(up,n,8), %r10
   142  	and	cnd, %r12
   143  	ADDSUB	%r12, %r10		C lowest limb: no carry-in
   144  	mov	%r10, (rp,n,8)
   145  	sbb	R32(%rax), R32(%rax)	C save carry
   146  	add	$1, n
   147  	jns	L(end)			C n reached 0: that was the only limb
   148  C Main loop: four limbs per pass.  The carry lives in eax (0 or -1) between passes because add to n clobbers CF.
   149  	ALIGN(16)
   150  L(top):	mov	(vp,n,8), %r12
   151  	mov	8(vp,n,8), %r13
   152  	mov	16(vp,n,8), %r14
   153  	mov	24(vp,n,8), %r11
   154  	and	cnd, %r12		C mask the four vp limbs, interleaved with the up loads
   155  	mov	(up,n,8), %r10
   156  	and	cnd, %r13
   157  	mov	8(up,n,8), %rbx
   158  	and	cnd, %r14
   159  	mov	16(up,n,8), %rbp
   160  	and	cnd, %r11
   161  	mov	24(up,n,8), %r9
   162  	add	R32(%rax), R32(%rax)	C restore carry: CF = 1 iff eax was -1
   163  	ADCSBB	%r12, %r10
   164  	mov	%r10, (rp,n,8)
   165  	ADCSBB	%r13, %rbx
   166  	mov	%rbx, 8(rp,n,8)
   167  	ADCSBB	%r14, %rbp
   168  	mov	%rbp, 16(rp,n,8)
   169  	ADCSBB	%r11, %r9
   170  	mov	%r9, 24(rp,n,8)
   171  	sbb	R32(%rax), R32(%rax)	C save carry
   172  	add	$4, n
   173  	js	L(top)			C loop while n is still negative
   174  C Convert the saved carry (0 or -1) into the return value (0 or 1), then restore registers in reverse push order.
   175  L(end):	neg	R32(%rax)
   176  	pop	%r14
   177  	pop	%r13
   178  	pop	%r12
   179  	pop	%rbp
   180  	pop	%rbx
   181  	FUNC_EXIT()
   182  	ret
   183  EPILOGUE()