github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/aorsmul_1.asm (about)

     1  dnl  AMD64 mpn_addmul_1 and mpn_submul_1.
     2  
     3  dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C AMD K8,K9	 2.5
    35  C AMD K10	 2.5
    36  C AMD bd1	 5.0
    37  C AMD bobcat	 6.17
    38  C Intel P4	14.9
    39  C Intel core2	 5.09
    40  C Intel NHM	 4.9
    41  C Intel SBR	 4.0
    42  C Intel atom	21.3
    43  C VIA nano	 5.0
    44  
    45  C The loop of this code is the result of running a code generation and
    46  C optimization tool suite written by David Harvey and Torbjorn Granlund.
    47  
    48  C TODO
    49  C  * The loop is great, but the prologue and epilogue code was quickly written.
    50  C    Tune it!
    51  
    52  define(`rp',      `%rdi')   C DOS64: rcx.  Limb vector that ADDSUB updates in place
    53  define(`up',      `%rsi')   C DOS64: rdx.  Limb vector multiplied by vl; it is only read
    54  define(`n_param', `%rdx')   C DOS64: r8.   Limb count as it arrives from the caller
    55  define(`vl',      `%rcx')   C DOS64: r9.   The single multiplier limb
    56  
    57  define(`n',       `%r11')   C working count; negated before the loop, then stepped by 4 until non-negative
    58  
    59  ifdef(`OPERATION_addmul_1',`
    60        define(`ADDSUB',        `add')
    61        define(`func',  `mpn_addmul_1')
    62  ')	C addmul variant: ADDSUB is add.  OPERATION_addmul_1 is not set in this file, presumably the build defines it
    63  ifdef(`OPERATION_submul_1',`
    64        define(`ADDSUB',        `sub')
    65        define(`func',  `mpn_submul_1')
    66  ')	C submul variant: ADDSUB is sub, so the CF it leaves behind is a borrow rather than a carry
    67  
    68  ABI_SUPPORT(DOS64)
    69  ABI_SUPPORT(STD64)
        C NOTE(review): ABI_SUPPORT is defined outside this file; presumably it
        C records that the code below handles both calling conventions.  The
        C per-ABI differences themselves are wrapped in IFDOS and IFSTD.
    70  
        C NOTE(review): MULFUNC_PROLOGUE is external as well; it is handed both
        C names that func can expand to.  Comments are kept on separate lines
        C here in case these macro lines are also parsed textually by the build
        C -- unverified.
    71  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
    72  
    73  IFDOS(`	define(`up', ``%rsi'')	') dnl  DOS64: up is still rsi; the entry code copies the incoming rdx into it
    74  IFDOS(`	define(`rp', ``%rcx'')	') dnl  DOS64: rp is used directly from its argument register rcx
    75  IFDOS(`	define(`vl', ``%r9'')	') dnl  DOS64: vl is used directly from r9; NOTE(review): the doubled quotes look intended to shield this %r9 from the r9 define on the next line
    76  IFDOS(`	define(`r9', ``rdi'')	') dnl  DOS64: every other r9 written in the body becomes rdi, since r9 itself holds vl
    77  IFDOS(`	define(`n',  ``%r8'')	') dnl  DOS64: n is used directly from r8, quoted the same way as vl
    78  IFDOS(`	define(`r8', ``r11'')	') dnl  DOS64: every other r8 written in the body becomes r11, since r8 itself holds n
    79  
    80  ASM_START()
    81  	TEXT
    82  	ALIGN(16)
    83  PROLOGUE(func)
    84  
        C func is mpn_addmul_1 or mpn_submul_1, whichever the OPERATION_* test
        C earlier in the file selected.  For each i in 0 .. n-1 it forms
        C up[i] * vl and applies it to rp[i] with ADDSUB, passing the high
        C product word and the carry or borrow on to limb i+1.  The word left
        C over after the last limb is returned in rax.
        C
        C Arguments on entry:  STD64  rp=rdi  up=rsi  n=rdx  vl=rcx
        C                      DOS64  rp=rcx  up=rdx  n=r8   vl=r9
        C rbx is saved and restored, and under DOS64 so are rsi and rdi.
        C NOTE(review): up[0] is loaded before n is examined, so n >= 1 looks
        C like a precondition -- confirm against the callers.
    85  IFDOS(``push	%rsi		'')	C DOS64 only: save rsi, it is overwritten with up two lines down
    86  IFDOS(``push	%rdi		'')	C DOS64 only: save rdi, the body uses it where r9 is written
    87  IFDOS(``mov	%rdx, %rsi	'')	C DOS64 only: up arrives in rdx, which mul is about to overwrite
    88  
    89  	mov	(up), %rax		C read first u limb early
    90  	push	%rbx			C rbx is used as scratch and popped again before ret
    91  IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
    92  IFDOS(`	mov	n, %rbx         ')	C DOS64: n stays in its argument register, rbx gets a copy
    93  	mul	vl			C rdx:rax = up[0] * vl
    94  IFSTD(`	mov	%rbx, n         ')	C STD64: the full count lives in n from here on
    95  
    96  	and	$3, R32(%rbx)		C n mod 4 picks the entry point into the 4-way unrolled loop
    97  	jz	L(b0)			C n mod 4 == 0
    98  	cmp	$2, R32(%rbx)
    99  	jz	L(b2)			C n mod 4 == 2
   100  	jg	L(b3)			C n mod 4 == 3, otherwise fall into L(b1)
   101  
        C n mod 4 == 1
   102  L(b1):	dec	n
   103  	jne	L(gt1)
   104  	ADDSUB	%rax, (rp)		C n was 1: apply the only low product word
   105  	jmp	L(ret)			C L(ret) adds CF to rdx and returns it
   106  L(gt1):	lea	8(up,n,8), up		C bias up and rp so that the negated count
   107  	lea	-8(rp,n,8), rp		C  addresses the right limbs inside the loop
   108  	neg	n			C n is negative from here and steps up by 4
   109  	xor	%r10, %r10
   110  	xor	R32(%rbx), R32(%rbx)
   111  	mov	%rax, %r9		C low word of up[0]*vl, applied to rp[0] at L(L1)
   112  	mov	(up,n,8), %rax		C up[1], multiplied at L(L1)
   113  	mov	%rdx, %r8		C high word of up[0]*vl
   114  	jmp	L(L1)
   115  
        C n mod 4 == 0
   116  L(b0):	lea	(up,n,8), up
   117  	lea	-16(rp,n,8), rp
   118  	neg	n
   119  	xor	%r10, %r10
   120  	mov	%rax, %r8		C low word of up[0]*vl, applied to rp[0] at L(L0)
   121  	mov	%rdx, %rbx		C high word of up[0]*vl
   122  	jmp	 L(L0)
   123  
        C n mod 4 == 3
   124  L(b3):	lea	-8(up,n,8), up
   125  	lea	-24(rp,n,8), rp
   126  	neg	n
   127  	mov	%rax, %rbx		C low word of up[0]*vl, applied to rp[0] at L(L3)
   128  	mov	%rdx, %r10		C high word of up[0]*vl
   129  	jmp	L(L3)
   130  
        C n mod 4 == 2
   131  L(b2):	lea	-16(up,n,8), up
   132  	lea	-32(rp,n,8), rp
   133  	neg	n
   134  	xor	%r8, %r8
   135  	xor	R32(%rbx), R32(%rbx)
   136  	mov	%rax, %r10		C low word of up[0]*vl, applied to rp[0] after L(L2)
   137  	mov	24(up,n,8), %rax	C up[1], multiplied at L(L2)
   138  	mov	%rdx, %r9		C high word of up[0]*vl
   139  	jmp	L(L2)
   140  
        C Main loop, four limbs per pass.  r10, r9, r8 and rbx take turns
        C holding the word that the next ADDSUB applies to rp.  Each ADDSUB
        C leaves its carry or borrow in CF; the first adc after it adds CF and
        C the low half of the newest product to the following word, and the
        C second adc starts the word after that, in a register that is zero at
        C that point, from the high half plus any carry.
   141  	ALIGN(16)
   142  L(top):	ADDSUB	%r10, (rp,n,8)		C apply the pending word; CF = carry or borrow out
   143  	adc	%rax, %r9		C next word += low half of the product from L(L2) + CF
   144  	mov	(up,n,8), %rax		C load the next u limb
   145  	adc	%rdx, %r8		C r8 was zero: it becomes the high half + CF
   146  	mov	$0, R32(%r10)		C clear r10 for its next turn
   147  L(L1):	mul	vl			C entry for n mod 4 == 1 with n > 1
   148  	ADDSUB	%r9, 8(rp,n,8)
   149  	adc	%rax, %r8
   150  	adc	%rdx, %rbx		C rbx was zero
   151  L(L0):	mov	8(up,n,8), %rax		C entry for n mod 4 == 0
   152  	mul	vl
   153  	ADDSUB	%r8, 16(rp,n,8)
   154  	adc	%rax, %rbx
   155  	adc	%rdx, %r10		C r10 was zero
   156  L(L3):	mov	16(up,n,8), %rax	C entry for n mod 4 == 3
   157  	mul	vl
   158  	ADDSUB	%rbx, 24(rp,n,8)
   159  	mov	$0, R32(%r8)		C zero; mov rather than xor, so CF from the ADDSUB reaches the adc below
   160  	mov	%r8, %rbx		C zero
   161  	adc	%rax, %r10
   162  	mov	24(up,n,8), %rax
   163  	mov	%r8, %r9		C zero
   164  	adc	%rdx, %r9
   165  L(L2):	mul	vl			C entry for n mod 4 == 2
   166  	add	$4, n			C advance four limbs
   167  	js	 L(top)			C repeat while n is still negative
   168  
        C Wind-down: the words in r10 and r9 go to the last two limbs of rp.
   169  	ADDSUB	%r10, (rp,n,8)
   170  	adc	%rax, %r9		C low half of the last product + CF
   171  	adc	%r8, %rdx		C r8 is zero here, so only CF is added to the high half
   172  	ADDSUB	%r9, 8(rp,n,8)		C last limb of rp
   173  L(ret):	adc	$0, %rdx		C add the final carry or borrow to the high word
   174  	mov	%rdx, %rax		C return value
   175  
   176  	pop	%rbx
   177  IFDOS(``pop	%rdi		'')
   178  IFDOS(``pop	%rsi		'')
   179  	ret
   180  EPILOGUE()