github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mul_1.asm (about)

     1  dnl  AMD64 mpn_mul_1.
     2  
     3  dnl  Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C AMD K8,K9	 2.5
    35  C AMD K10	 2.5
    36  C AMD bd1	 5.0
    37  C AMD bobcat	 5.5
    38  C Intel P4	12.3
    39  C Intel core2	 4.0
    40  C Intel NHM	 3.75
    41  C Intel SBR	 2.95
    42  C Intel atom	19.8
    43  C VIA nano	 4.25
    44  
    45  C The loop of this code is the result of running a code generation and
    46  C optimization tool suite written by David Harvey and Torbjorn Granlund.
    47  
    48  C TODO
    49  C  * The loop is great, but the prologue and epilogue code was quickly written.
    50  C    Tune it!
    51  C Symbolic register names.  Defaults are for STD64; the DOS64 register is noted on each line.
    52  define(`rp',      `%rdi')   C DOS64: rcx.  Destination pointer, product limbs are stored here.
    53  define(`up',      `%rsi')   C DOS64: rdx.  Source pointer; the DOS64 prologues copy it to rsi.
    54  define(`n_param', `%rdx')   C DOS64: r8.  Incoming limb count.
    55  define(`vl',      `%rcx')   C DOS64: r9.  The single limb every source limb is multiplied by.
    56  C n is the working count/index register; under STD64 it is loaded from n_param via rbx.
    57  define(`n',       `%r11')
    58  
    59  ABI_SUPPORT(DOS64)
    60  ABI_SUPPORT(STD64)
    61  C DOS64 overrides.  NOTE(review): the r9 and r8 defines presumably redirect the scratch uses of %r9 and %r8 in the body to %rdi and %r11, because vl and n occupy the real r9 and r8 here - confirm against m4 rescanning rules.
    62  IFDOS(`	define(`up', ``%rsi'')	') dnl
    63  IFDOS(`	define(`rp', ``%rcx'')	') dnl
    64  IFDOS(`	define(`vl', ``%r9'')	') dnl
    65  IFDOS(`	define(`r9', ``rdi'')	') dnl
    66  IFDOS(`	define(`n',  ``%r8'')	') dnl
    67  IFDOS(`	define(`r8', ``r11'')	') dnl
    68  C mpn_mul_1c: entry point that takes an extra carry-in limb, loads it into r10 and joins mpn_mul_1 at L(common).
    69  ASM_START()
    70  	TEXT
    71  	ALIGN(16)
    72  PROLOGUE(mpn_mul_1c)
    73  IFDOS(``push	%rsi		'')	C DOS64: rsi and rdi are callee-saved and are used below
    74  IFDOS(``push	%rdi		'')
    75  IFDOS(``mov	%rdx, %rsi	'')	C DOS64: move up out of rdx, which mul overwrites
    76  	push	%rbx			C rbx is callee-saved and used as scratch; popped after L(ret)
    77  IFSTD(`	mov	%r8, %r10')		C STD64: carry-in is the fifth argument, in r8
    78  IFDOS(`	mov	64(%rsp), %r10')	C DOS64: carry-in is the fifth argument, on the stack at 40 + 3*8  (3 push insns)
    79  	jmp	L(common)		C continue in the shared body; the matching pops and ret are there
    80  EPILOGUE()
    81  C mpn_mul_1: multiply the n limbs at up by vl, store the low limbs at rp, and return the final high limb in rax.
    82  PROLOGUE(mpn_mul_1)
    83  IFDOS(``push	%rsi		'')	C DOS64: rsi and rdi are callee-saved and are used below
    84  IFDOS(``push	%rdi		'')
    85  IFDOS(``mov	%rdx, %rsi	'')	C DOS64: move up out of rdx, which mul overwrites
    86  C NOTE(review): n = 0 is not special-cased; the first limb is always read and one limb is always stored.  Presumably callers guarantee n >= 1.
    87  	push	%rbx			C rbx is callee-saved and used as scratch
    88  	xor	%r10, %r10		C this entry point has no carry-in
    89  L(common):
    90  	mov	(up), %rax		C read first u limb early
    91  IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
    92  IFDOS(`	mov	n, %rbx         ')
    93  	mul	vl			C rdx:rax = first u limb times vl
    94  IFSTD(`	mov	%rbx, n         ')
    95  
    96  	add	%r10, %rax		C add the carry-in (zero for mpn_mul_1) to the first product
    97  	adc	$0, %rdx
    98  
    99  	and	$3, R32(%rbx)		C dispatch on n mod 4 to pick the entry into the 4-way unrolled loop
   100  	jz	L(b0)
   101  	cmp	$2, R32(%rbx)
   102  	jz	L(b2)
   103  	jg	L(b3)
   104  C Each entry below biases up and rp by 8*n plus a per-entry offset and negates n, so (up,n,8) and (rp,n,8) index from the end and n counts up.
   105  L(b1):	dec	n			C n mod 4 = 1
   106  	jne	L(gt1)
   107  	mov	%rax, (rp)		C n was 1: store the only low limb
   108  	jmp	L(ret)			C and return the high limb held in rdx
   109  L(gt1):	lea	8(up,n,8), up		C bias chosen for entering the loop at L(L1)
   110  	lea	-8(rp,n,8), rp
   111  	neg	n			C negative index, stepped towards zero by the loop
   112  	xor	%r10, %r10
   113  	xor	R32(%rbx), R32(%rbx)
   114  	mov	%rax, %r9		C low limb of the first product, stored at L(L1)
   115  	mov	(up,n,8), %rax		C second u limb, multiplied at L(L1)
   116  	mov	%rdx, %r8		C high limb of the first product, added into the next limb
   117  	jmp	L(L1)
   118  
   119  L(b0):	lea	(up,n,8), up		C n mod 4 = 0; bias chosen for entering at L(L0)
   120  	lea	-16(rp,n,8), rp
   121  	neg	n
   122  	xor	%r10, %r10
   123  	mov	%rax, %r8		C low limb of the first product, stored at L(L0)
   124  	mov	%rdx, %rbx		C high limb of the first product, added into the next limb
   125  	jmp	 L(L0)
   126  
   127  L(b3):	lea	-8(up,n,8), up		C n mod 4 = 3; bias chosen for entering at L(L3)
   128  	lea	-24(rp,n,8), rp
   129  	neg	n
   130  	mov	%rax, %rbx		C low limb of the first product, stored at L(L3)
   131  	mov	%rdx, %r10		C high limb of the first product, added into the next limb
   132  	jmp	L(L3)
   133  
   134  L(b2):	lea	-16(up,n,8), up		C n mod 4 = 2; bias chosen for entering at L(L2)
   135  	lea	-32(rp,n,8), rp
   136  	neg	n
   137  	xor	%r8, %r8
   138  	xor	R32(%rbx), R32(%rbx)
   139  	mov	%rax, %r10		C low limb of the first product, stored at L(top) or in the wind-down code
   140  	mov	24(up,n,8), %rax	C second u limb, multiplied at L(L2)
   141  	mov	%rdx, %r9		C high limb of the first product, added into the next limb
   142  	jmp	L(L2)
   143  C Main loop: four limbs per pass.  r10, r9, r8 and rbx take turns holding the limb about to be stored and the one being accumulated.
   144  	ALIGN(16)
   145  L(top):	mov	%r10, (rp,n,8)		C store a finished limb
   146  	add	%rax, %r9		C add the low half of the L(L2) product into the pending limb
   147  	mov	(up,n,8), %rax		C load the next u limb
   148  	adc	%rdx, %r8		C high half plus carry starts the following limb (r8 was zero)
   149  	mov	$0, R32(%r10)		C clear r10 so it can accumulate again at L(L0)
   150  L(L1):	mul	vl
   151  	mov	%r9, 8(rp,n,8)		C store a finished limb
   152  	add	%rax, %r8
   153  	adc	%rdx, %rbx
   154  L(L0):	mov	8(up,n,8), %rax
   155  	mul	vl
   156  	mov	%r8, 16(rp,n,8)		C store a finished limb
   157  	add	%rax, %rbx
   158  	adc	%rdx, %r10
   159  L(L3):	mov	16(up,n,8), %rax
   160  	mul	vl
   161  	mov	%rbx, 24(rp,n,8)	C store a finished limb
   162  	mov	$0, R32(%r8)		C zero
   163  	mov	%r8, %rbx		C zero
   164  	add	%rax, %r10
   165  	mov	24(up,n,8), %rax
   166  	mov	%r8, %r9		C zero
   167  	adc	%rdx, %r9
   168  L(L2):	mul	vl
   169  	add	$4, n			C advance the index by the four limbs handled per pass
   170  	js	 L(top)			C keep looping while the index is still negative
   171  C Wind-down: two limbs are still pending in r10 and r9, and rdx:rax holds the last product.
   172  	mov	%r10, (rp,n,8)
   173  	add	%rax, %r9
   174  	adc	%r8, %rdx		C r8 is zero on every path here, so this only adds the carry
   175  	mov	%r9, 8(rp,n,8)
   176  	add	%r8, %rdx		C r8 is still zero, so rdx is unchanged by this add
   177  L(ret):	mov	%rdx, %rax		C return value: the final high limb
   178  
   179  	pop	%rbx
   180  IFDOS(``pop	%rdi		'')
   181  IFDOS(``pop	%rsi		'')
   182  	ret
   183  EPILOGUE()