github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bd1/aorsmul_1.asm (about)

     1  dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
     2  
     3  dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C AMD K8,K9
    35  C AMD K10
    36  C AMD bd1	 4.5-4.7
    37  C AMD bobcat
    38  C Intel P4
    39  C Intel core2
    40  C Intel NHM
    41  C Intel SBR
    42  C Intel atom
    43  C VIA nano
    44  
    45  C The loop of this code is the result of running a code generation and
    46  C optimisation tool suite written by David Harvey and Torbjorn Granlund.
    47  
    48  C TODO
    49  C  * Try to make loop run closer to 4 c/l.
    50  
    51  define(`rp',      `%rdi')   C rcx
    52  define(`up',      `%rsi')   C rdx
    53  define(`n_param', `%rdx')   C r8
    54  define(`v0',      `%rcx')   C r9
        C The four defines above name the incoming arguments by their STD64
        C registers. The trailing comment on each line is the DOS64 register
        C that carries the same argument.
    55  
        C Working copy of the limb count. n_param is rdx and mul writes rdx, so
        C for STD64 the count is moved to r11 (see the IFSTD moves in the function).
    56  define(`n',       `%r11')
    57  
        C Operation selection. When OPERATION_addmul_1 is defined, ADDSUB becomes
        C add and the entry point is mpn_addmul_1. When OPERATION_submul_1 is
        C defined, ADDSUB becomes sub and the entry point is mpn_submul_1.
        C NOTE(review): the OPERATION_ symbols are set outside this file,
        C presumably one per build of this source. Confirm against the build setup.
    58  ifdef(`OPERATION_addmul_1',`
    59        define(`ADDSUB',        `add')
    60        define(`func',  `mpn_addmul_1')
    61  ')
    62  ifdef(`OPERATION_submul_1',`
    63        define(`ADDSUB',        `sub')
    64        define(`func',  `mpn_submul_1')
    65  ')
    66  
        C ABI_SUPPORT and MULFUNC_PROLOGUE are defined outside this file.
        C NOTE(review): they appear to declare the supported ABIs and the list of
        C entry points this source can produce. Confirm in the shared m4 definitions.
    67  ABI_SUPPORT(DOS64)
    68  ABI_SUPPORT(STD64)
    69  
    70  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
    71  
        C DOS64 register remapping. IFDOS is defined outside this file and is
        C presumably empty for STD64. The doubled quotes leave one quote level in
        C each definition, so the register name on the right is emitted literally
        C and is not rescanned as a macro (standard m4 quoting). The result is:
        C   rp = %rcx   up = %rsi   v0 = %r9   n = %r8    (used as written)
        C while a bare r9 or r8 appearing in the code below is renamed to rdi or
        C r11. This keeps the arguments arriving in r8 and r9 intact and still
        C gives the loop its two extra scratch registers.
    72  IFDOS(`	define(`up', ``%rsi'')	') dnl
    73  IFDOS(`	define(`rp', ``%rcx'')	') dnl
    74  IFDOS(`	define(`v0', ``%r9'')	') dnl
    75  IFDOS(`	define(`r9', ``rdi'')	') dnl
    76  IFDOS(`	define(`n',  ``%r8'')	') dnl
    77  IFDOS(`	define(`r8', ``r11'')	') dnl
    78  
    79  ASM_START()
    80  	TEXT
    81  	ALIGN(16)
        C func(rp, up, n, v0) where func is mpn_addmul_1 or mpn_submul_1.
        C
        C For each limb index i from 0 to n-1 the 128-bit product up[i] * v0 is
        C formed with mul. Its low limb, together with the high limb of the
        C previous product and the carry or borrow of the previous step, is
        C combined into rp[i] with ADDSUB (add or sub). The limb left over past
        C the most significant position is returned in rax.
        C
        C n is assumed to be at least 1: up[0] is loaded before n is examined.
        C
        C Register roles:
        C   rax, rdx          low and high limb of the most recent mul
        C   r10, r9, r8, rbx  rotating accumulators. Each one gathers the high
        C                     limb of one product, the low limb of the next and
        C                     the carry flag left by the preceding ADDSUB
        C   n                 negative limb index that counts up towards zero
        C   rp, up            rebased once so that small fixed displacements plus
        C                     the negative index address the current limb
        C
        C Accumulators are cleared with mov $0 and not with xor: several clears
        C sit between an ADDSUB and the adc that consumes its carry flag, and mov
        C does not modify the flags.
    82  PROLOGUE(func)
        C DOS64 only: rsi and rdi are callee-saved there and are used below, so
        C they are saved. The second argument arrives in rdx and is moved to rsi.
    83  IFDOS(``push	%rsi		'')
    84  IFDOS(``push	%rdi		'')
    85  IFDOS(``mov	%rdx, %rsi	'')
    86  
    87  	mov	(up), %rax		C read first u limb early
    88  	push	%rbx			C rbx is callee-saved and used as an accumulator
    89  IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
    90  IFDOS(`	mov	n, %rbx		')
    91  	mul	v0			C rdx:rax = up[0] * v0
    92  
    93  IFSTD(`	mov	%rbx, n		')	C STD64: working count now lives in n
    94  
        C Dispatch on n mod 4 to pick the loop entry point. The lea between the
        C and and the jz does not change the flags.
    95  	and	$3, R32(%rbx)		C ebx = n mod 4, ZF set when it is 0
    96  	lea	-16(rp,n,8), rp		C rp now points at rp[n-2]
    97  	jz	L(b0)			C n mod 4 == 0
    98  	cmp	$2, R32(%rbx)
    99  	jb	L(b1)			C n mod 4 == 1
   100  	jz	L(b2)			C n mod 4 == 2
   101  
        C n mod 4 == 3: first product goes to rbx (low) and r10 (high), join at L3.
   102  L(b3):	mov	$0, R32(%r8)
   103  	mov	%rax, %rbx
   104  	mov	$0, R32(%r9)
   105  	mov	8(up), %rax		C next u limb, up[1]
   106  	mov	%rdx, %r10
   107  	lea	(up,n,8), up		C up now points one past the last u limb
   108  	not	n			C n = -n - 1
   109  	jmp	L(L3)
   110  
        C n mod 4 == 0: first product goes to r8 (low) and rbx (high), join at L0.
   111  L(b0):	mov	$0, R32(%r10)
   112  	mov	%rax, %r8
   113  	mov	%rdx, %rbx
   114  	mov	8(up), %rax		C next u limb, up[1]
   115  	lea	(up,n,8), up		C up now points one past the last u limb
   116  	neg	n			C n = -n
   117  	jmp	L(L0)
   118  
        C n mod 4 == 1: first product goes to r9 (low) and r8 (high), join at L1.
        C When n is exactly 1 the product in rdx:rax is finished at L(n1).
   119  L(b1):	cmp	$1, n
   120  	jz	L(n1)
   121  	mov	%rax, %r9
   122  	mov	8(up), %rax		C next u limb, up[1]
   123  	mov	%rdx, %r8
   124  	mov	$0, R32(%rbx)
   125  	lea	(up,n,8), up		C up now points one past the last u limb
   126  	neg	n
   127  	inc	n			C n = 1 - n
   128  	jmp	L(L1)
   129  
        C n mod 4 == 2: first product goes to r10 (low) and r9 (high). Falls into
        C the top of the loop, or goes straight to the wind-down when n is 2.
   130  L(b2):	mov	$0, R32(%rbx)
   131  	mov	%rax, %r10
   132  	mov	%rdx, %r9
   133  	mov	8(up), %rax		C next u limb, up[1]
   134  	mov	$0, R32(%r8)
   135  	lea	(up,n,8), up		C up now points one past the last u limb
   136  	neg	n
   137  	add	$2, n			C n = 2 - n
   138  	jns	L(end)			C n was 2: no loop iterations needed
   139  
        C Main loop, four limbs per pass. Each stage multiplies the u limb that
        C is already in rax, applies the finished accumulator to rp with ADDSUB,
        C folds the new product and the carry flag into the next two accumulators
        C with adc, and loads the following u limb. L1, L0 and L3 are the
        C mid-loop entry points used by the feed-in code above.
   140  	ALIGN(32)
   141  L(top):	mul	v0
   142  	ADDSUB	%r10, (rp,n,8)
   143  	adc	%rax, %r9
   144  	mov	(up,n,8), %rax
   145  	adc	%rdx, %r8
   146  L(L1):	mul	v0
   147  	mov	$0, R32(%r10)
   148  	ADDSUB	%r9, 8(rp,n,8)
   149  	adc	%rax, %r8
   150  	adc	%rdx, %rbx
   151  	mov	8(up,n,8), %rax
   152  L(L0):	mul	v0
   153  	ADDSUB	%r8, 16(rp,n,8)
   154  	mov	$0, R32(%r8)		C flags from ADDSUB must survive until the adc
   155  	adc	%rax, %rbx
   156  	mov	$0, R32(%r9)
   157  	mov	16(up,n,8), %rax
   158  	adc	%rdx, %r10
   159  L(L3):	mul	v0
   160  	ADDSUB	%rbx, 24(rp,n,8)
   161  	mov	$0, R32(%rbx)		C flags from ADDSUB must survive until the adc
   162  	adc	%rax, %r10
   163  	adc	%rdx, %r9
   164  	mov	24(up,n,8), %rax
   165  	add	$4, n			C advance four limbs
   166  	js	L(top)			C loop while the index is still negative
   167  
        C Wind-down: multiply the last u limb and write the final two result
        C limbs. rp has pointed at rp[n-2] since the lea at function entry.
   168  L(end):	mul	v0
   169  	ADDSUB	%r10, (rp)		C rp[n-2]
   170  	adc	%r9, %rax
   171  	adc	%r8, %rdx
   172  L(n1):	ADDSUB	%rax, 8(rp)		C rp[n-1]
   173  	adc	$0, %rdx		C fold the last carry or borrow into the high limb
   174  	mov	%rdx, %rax		C return value
   175  
   176  	pop	%rbx
   177  IFDOS(``pop	%rdi		'')
   178  IFDOS(``pop	%rsi		'')
   179  	ret
   180  EPILOGUE()
   181  ASM_END()