github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mul_2.asm

dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.275
C AMD K10	 2.275
C Intel P4	13.5
C Intel core2	 4.0
C Intel corei	 3.8
C Intel atom	 ?
C VIA nano	 ?

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?
C  * Replace with 2.25 c/l sequence.

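C Illustrative C sketch of what this routine computes, matching the epilogue
C below, which stores the last two result limbs at (rp) and 8(rp) and returns
C the top limb in %rax: rp[0..n] gets the low n+1 limbs of {up,n} * {vp,2} and
C the most significant limb is the return value.  The names ref_mul_2 and
C limb_t are made up for this sketch; it assumes 64-bit limbs, n >= 1, and a
C compiler with unsigned __int128.  It is a reference model, not the code the
C loop below runs.
C
C	typedef unsigned long long limb_t;         /* one 64-bit limb */
C
C	static limb_t ref_mul_2(limb_t *rp, const limb_t *up,
C	                        long n, const limb_t *vp)
C	{
C	    limb_t v0 = vp[0], v1 = vp[1];
C	    unsigned __int128 carry = 0;    /* value still owed at limb i */
C	    for (long i = 0; i < n; i++) {
C	        unsigned __int128 p0 = (unsigned __int128) up[i] * v0;
C	        unsigned __int128 p1 = (unsigned __int128) up[i] * v1;
C	        /* limb i: low half of p0 plus the low limb of the carry */
C	        unsigned __int128 s = (unsigned __int128) (limb_t) carry
C	                              + (limb_t) p0;
C	        rp[i] = (limb_t) s;
C	        /* everything that belongs to limbs i+1 and i+2 */
C	        carry = (carry >> 64) + (s >> 64) + (p0 >> 64) + p1;
C	    }
C	    rp[n] = (limb_t) carry;         /* low limb of the final carry */
C	    return (limb_t) (carry >> 64);  /* top limb, like %rax below */
C	}
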
C INPUT PARAMETERS
define(`rp',	 `%rdi')
define(`up',	 `%rsi')
define(`n_param',`%rdx')
define(`vp',	 `%rcx')

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

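C v0 and v1 hold the two multiplier limbs vp[0] and vp[1] for the whole run.
C w0-w3 form a rotating window of partial-sum limbs that feeds the 4-way
C unrolled loop, and n is a negated index that counts up toward zero.  w1
C reuses %rcx, which is safe because vp is only read before w1 is first
C written.
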
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp

	mov	(vp), v0
	mov	8(vp), v1

	mov	(up), %rax

	mov	n_param, n
	neg	n
	lea	-8(up,n_param,8), up
	lea	-8(rp,n_param,8), rp

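C up and rp now point at their last limbs and n holds the negated count, so an
C operand like 8(up,n,8) starts at the first limb and walks forward while n
C climbs toward zero.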
	and	$3, R32(n_param)
	jz	L(m2p0)
	cmp	$2, R32(n_param)
	jc	L(m2p1)
	jz	L(m2p2)
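
C Feed-in: the low two bits of n_param pick one of four entry paths.  Each
C path primes the w0-w3 window and adjusts n so that every residue of n mod 4
C joins the unrolled loop at the right phase.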
L(m2p3):
	mul	v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	add	$-1, n
	mul	v1
	add	%rax, w2
	jmp	L(m23)
L(m2p0):
	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)
L(m2p1):
	mul	v0
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)
L(m2p2):
	mul	v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax
	add	$-2, n
	jmp	L(m22)


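C Main loop, 4-way unrolled: each pass multiplies four up[] limbs by both v0
C and v1, folds the cross products into the w0-w3 window through the adc
C chains, stores four result limbs, then advances n by 4 and repeats while n
C is still negative.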
	ALIGN(32)
L(m2top):
	add	%rax, w3
	adc	%rdx, w0
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)
	mul	v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	%rax, w2
	mov	w0, 8(rp,n,8)
L(m23):	adc	%rdx, w3
	mov	24(up,n,8), %rax
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)
	adc	$0, R32(w0)
L(m22):	mul	v1
	add	%rax, w3
	mov	w2, 24(rp,n,8)
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0
	add	$4, n
	js	L(m2top)


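C Wind-down: the loop exits with the last mul v0 result still in %rdx:%rax.
C Fold it in, do the final mul v1 for the top of the product, store the last
C two result limbs at (rp) and 8(rp), and return the most significant limb in
C %rax.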
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax
	mul	v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()