github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/addmul_2.asm (about)

     1  dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
     2  dnl  add the result to a third limb vector.
     3  
     4  dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C	     cycles/limb
    35  C AMD K8,K9	 2.375
    36  C AMD K10	 2.375
    37  C Intel P4	15-16
    38  C Intel core2	 4.45
    39  C Intel NHM	 4.32
    40  C Intel SBR	 3.4
    41  C Intel atom	 ?
    42  C VIA nano	 4.4
    43  
    44  C This code is the result of running a code generation and optimization tool
    45  C suite written by David Harvey and Torbjorn Granlund.
    46  
    47  C TODO
    48  C  * Tune feed-in and wind-down code.
    49  
    50  C INPUT PARAMETERS
    51  define(`rp',     `%rdi')	C vector that the product is added into, in place
    52  define(`up',     `%rsi')	C source vector of n limbs
    53  define(`n_param',`%rdx')	C limb count; copied to n at entry because mul overwrites %rdx
    54  define(`vp',     `%rcx')	C pointer to the two multiplier limbs
    55  
        C Working registers.  w1 reuses %rcx, so vp is dead once v0 and v1 have
        C been loaded.  w0 (%rbx) and w2 (%rbp) are callee-saved registers; the
        C function pushes them on entry and pops them before returning.
    56  define(`v0', `%r8')	C limb loaded from 0(vp)
    57  define(`v1', `%r9')	C limb loaded from 8(vp)
    58  define(`w0', `%rbx')	C w0 .. w3: accumulator limbs
    59  define(`w1', `%rcx')
    60  define(`w2', `%rbp')
    61  define(`w3', `%r10')
    62  define(`n',  `%r11')	C negated limb index, counted up towards zero
    63  
    64  ABI_SUPPORT(DOS64)	C macro defined outside this file; presumably declares the supported ABIs -- TODO confirm
    65  ABI_SUPPORT(STD64)
    66  
    67  ASM_START()
    68  	TEXT
    69  	ALIGN(16)
        C mpn_addmul_2 (rp, up, n, vp)
        C
        C Multiplies the n limbs at up by the two limbs at vp and adds the product
        C into the limbs at rp.  Limbs rp[0] .. rp[n-1] are updated with
        C add-to-memory instructions, rp[n] is overwritten with a plain store, and
        C the most significant limb of the result is returned in %rax.
        C
        C There is no n = 0 check; the first limb is loaded and multiplied
        C unconditionally.  Presumably callers guarantee n >= 1 -- TODO confirm.
        C
        C Layout: feed-in code picks one of four entry points (lo0 .. lo3) of a
        C 4-way unrolled loop according to n mod 4.  w0 .. w3 form a rotating
        C accumulator: each quarter of the loop adds one finished register into rp,
        C and that register is cleared again to serve as the new top carry word.
        C
        C R32(x) is a macro defined outside this file; presumably the 32-bit name
        C of register x -- TODO confirm.  A 32-bit register write zeroes the upper
        C half, so the xor and mov $0 forms below clear the whole 64-bit register.
    70  PROLOGUE(mpn_addmul_2)
    71  	FUNC_ENTRY(4)		C macro defined outside this file; presumably DOS64 argument setup -- TODO confirm
    72  	mov	n_param, n		C keep the count in n; mul below overwrites %rdx
    73  	push	%rbx			C callee-saved, used as w0
    74  	push	%rbp			C callee-saved, used as w2
    75  
    76  	mov	0(vp), v0		C fetch both multiplier limbs now: vp shares %rcx with w1
    77  	mov	8(vp), v1
    78  
    79  	mov	R32(n_param), R32(%rbx)	C low bits of n, reduced to n mod 4 below
    80  	mov	(up), %rax		C rax = up[0]
    81  	lea	-8(up,n_param,8), up	C up now points at up[n-1]
    82  	lea	-8(rp,n_param,8), rp	C rp now points at rp[n-1]
    83  	mul	v0			C rdx:rax = up[0] * v0; last use of n_param is above
    84  	neg	n			C n = -n, a negative index that counts up to 0
    85  	and	$3, R32(%rbx)		C n mod 4
    86  	jz	L(b0)
    87  	cmp	$2, R32(%rbx)
    88  	jc	L(b1)			C below 2, which here means 1
    89  	jz	L(b2)			C equal to 2; 3 falls through to b3
    90  
        C Feed-in.  On entry to each bN, rdx:rax = up[0] * v0.  The two halves go
        C into the accumulator registers that the chosen loop entry expects, the
        C remaining live accumulator registers are cleared, up[0] is reloaded for
        C the v1 multiply, and n is adjusted where needed so that the fixed offsets
        C at that entry point address limb 0.
    91  L(b3):	mov	%rax, w1
    92  	mov	%rdx, w2
    93  	xor	R32(w3), R32(w3)
    94  	mov	8(up,n,8), %rax		C reload up[0]
    95  	dec	n
    96  	jmp	L(lo3)
    97  
    98  L(b2):	mov	%rax, w2
    99  	mov	8(up,n,8), %rax		C reload up[0]
   100  	mov	%rdx, w3
   101  	xor	R32(w0), R32(w0)
   102  	add	$-2, n
   103  	jmp	L(lo2)
   104  
   105  L(b1):	mov	%rax, w3
   106  	mov	8(up,n,8), %rax		C reload up[0]
   107  	mov	%rdx, w0
   108  	xor	R32(w1), R32(w1)
   109  	inc	n
   110  	jmp	L(lo1)
   111  
   112  L(b0):	mov	$0, R32(w3)		C lo0 path does adc into w3 before the loop clears it
   113  	mov	%rax, w0
   114  	mov	8(up,n,8), %rax		C reload up[0]
   115  	mov	%rdx, w1
   116  	xor	R32(w2), R32(w2)
   117  	jmp	L(lo0)
   118  
        C Main loop, four limbs of up per pass.  Every limb is loaded twice, once
        C before each of its two multiplies, because mul overwrites %rax.
   119  	ALIGN(32)
   120  L(top):	mov	$0, R32(w1)		C w1 becomes the new top carry word
   121  	mul	v0			C limb was loaded at the bottom of the previous pass
   122  	add	%rax, w3
   123  	mov	(up,n,8), %rax		C same limb again, for the v1 multiply
   124  	adc	%rdx, w0
   125  	adc	$0, R32(w1)
   126  L(lo1):	mul	v1
   127  	add	w3, (rp,n,8)		C retire w3 into rp
   128  	mov	$0, R32(w3)		C mov, not xor: CF from the add must reach the adc below
   129  	adc	%rax, w0
   130  	mov	$0, R32(w2)		C likewise leaves CF intact for the next adc
   131  	mov	8(up,n,8), %rax
   132  	adc	%rdx, w1
   133  	mul	v0
   134  	add	%rax, w0
   135  	mov	8(up,n,8), %rax		C same limb again, for the v1 multiply
   136  	adc	%rdx, w1
   137  	adc	$0, R32(w2)
   138  L(lo0):	mul	v1
   139  	add	w0, 8(rp,n,8)		C retire w0 into rp
   140  	adc	%rax, w1
   141  	adc	%rdx, w2
   142  	mov	16(up,n,8), %rax
   143  	mul	v0
   144  	add	%rax, w1
   145  	adc	%rdx, w2
   146  	adc	$0, R32(w3)
   147  	mov	16(up,n,8), %rax	C same limb again, for the v1 multiply
   148  L(lo3):	mul	v1
   149  	add	w1, 16(rp,n,8)		C retire w1 into rp
   150  	adc	%rax, w2
   151  	adc	%rdx, w3
   152  	xor	R32(w0), R32(w0)	C xor clobbers CF; fine, the next adc follows a fresh add
   153  	mov	24(up,n,8), %rax
   154  	mul	v0
   155  	add	%rax, w2
   156  	mov	24(up,n,8), %rax	C same limb again, for the v1 multiply
   157  	adc	%rdx, w3
   158  	adc	$0, R32(w0)
   159  L(lo2):	mul	v1
   160  	add	w2, 24(rp,n,8)		C retire w2 into rp
   161  	adc	%rax, w3
   162  	adc	%rdx, w0
   163  	mov	32(up,n,8), %rax	C limb for the next pass (offset 0 once n is advanced)
   164  	add	$4, n
   165  	js	L(top)			C loop while n is still negative
   166  
        C Wind-down.  n has reached 0, so (up) and (rp) address the last limbs,
        C and %rax already holds up[n-1] from the load at the bottom of the loop.
   167  L(end):	xor	R32(w1), R32(w1)
   168  	mul	v0
   169  	add	%rax, w3
   170  	mov	(up), %rax		C reload up[n-1] for the v1 multiply
   171  	adc	%rdx, w0
   172  	adc	R32(w1), R32(w1)	C w1 was zero, so this leaves w1 = CF
   173  	mul	v1
   174  	add	w3, (rp)		C rp[n-1]
   175  	adc	%rax, w0
   176  	adc	%rdx, w1
   177  	mov	w0, 8(rp)		C rp[n] is stored, not added to
   178  	mov	w1, %rax		C return value: the top limb
   179  
   180  	pop	%rbp
   181  	pop	%rbx
   182  	FUNC_EXIT()
   183  	ret
   184  EPILOGUE()