github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/addmul_2.asm (about)

     1  dnl  AMD64 mpn_addmul_2 optimised for Intel Atom.
     2  
     3  dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb	best
    34  C AMD K8,K9
    35  C AMD K10
    36  C AMD bd1
    37  C AMD bd2
    38  C AMD bobcat
    39  C AMD jaguar
    40  C Intel P4
    41  C Intel PNR
    42  C Intel NHM
    43  C Intel SBR
    44  C Intel IBR
    45  C Intel HWL
    46  C Intel BWL
    47  C Intel atom	18.8		this
    48  C VIA nano
    49  
    50  C The loop of this code is the result of running a code generation and
    51  C optimisation tool suite written by David Harvey and Torbjorn Granlund.
    52  
    53  define(`rp',      `%rdi')   C rcx   destination limbs, read and written in place
    54  define(`up',      `%rsi')   C rdx   source limbs, only read
    55  define(`n_param', `%rdx')   C r8    limb count on entry, copied to n before the first mul
    56  define(`vp',      `%rcx')   C r9    two multiplier limbs, loaded into v0 and v1
        C NOTE(review): the second register named in each comment above is
        C presumably the DOS64 argument register for that parameter -- confirm
        C against the FUNC_ENTRY definition, which is not in this file.
    57  
    58  define(`v0', `%r8')		C vp[0]
    59  define(`v1', `%r9')		C vp[1]
    60  define(`w0', `%rbx')		C w0-w3 are rotating accumulators; %rbx is pushed on entry
    61  define(`w1', `%rcx')		C shares %rcx with vp, which is dead once v0 and v1 are loaded
    62  define(`w2', `%rbp')		C %rbp is pushed on entry
    63  define(`w3', `%r10')
    64  define(`n',  `%r11')		C loop counter; kept out of %rdx because mul writes rdx
    65  
    66  ABI_SUPPORT(DOS64)		C macro defined elsewhere; presumably declares a supported ABI
    67  ABI_SUPPORT(STD64)
    68  
    69  ASM_START()
    70  	TEXT
    71  	ALIGN(16)
        C mpn_addmul_2 -- arguments rp, up, n_param, vp
        C
        C Multiplies the limbs at up by the two limbs v0 = vp[0] and v1 = vp[1] and
        C accumulates the product into the limbs at rp.  As the code below is
        C written, the low result limbs are added into memory at rp, the limb after
        C them is stored (not added) at 8(rp) in the tail, and the most significant
        C limb is returned in rax.  Limbs are 8 bytes wide.
        C
        C Dispatch is on the two low bits of n.  Each b?? stub places the first
        C product up[0]*v0 in the pair of accumulators expected by its entry point,
        C zeroes the following accumulator, biases up and rp to match the fixed
        C offsets used at that entry point, and jumps into the 4-way unrolled loop.
        C
        C NOTE(review): for n = 1 the b01 path still executes one full loop pass
        C (reading further up limbs), so the routine appears to require n >= 2 --
        C confirm against the callers.
    72  PROLOGUE(mpn_addmul_2)
    73  	FUNC_ENTRY(4)		C macro defined elsewhere; presumably moves DOS64 arguments into place
    74  	push	%rbx		C save rbx, used below as w0
    75  	push	%rbp		C save rbp, used below as w2
    76  
    77  	mov	(up), %rax	C rax = up[0]
    78  
    79  	mov	(vp), v0	C v0 = vp[0]
    80  	mov	8(vp), v1	C v1 = vp[1]
    81  
    82  	mov	n_param, n	C move the count out of rdx, which mul overwrites
    83  	mul	v0		C rdx:rax = up[0] * v0
    84  
    85  	test	$1, R8(n)	C bit 0 of n
    86  	jnz	L(bx1)
    87  
    88  L(bx0):	test	$2, R8(n)	C bit 1 of n
    89  	jnz	L(b10)
    90  
    91  L(b00):	mov	%rax, w0	C n mod 4 = 0: low limb of up[0]*v0
    92  	mov	(up), %rax	C reload up[0] for the v1 multiply at lo0
    93  	mov	%rdx, w1	C high limb of up[0]*v0; overwrites vp, already consumed
    94  	xor	R32(w2), R32(w2)	C clear the carry accumulator; flags are not live here
    95  	lea	-8(rp), rp	C bias rp so that 8(rp) at lo0 is the first result limb
    96  	jmp	L(lo0)
    97  
    98  L(b10):	mov	%rax, w2	C n mod 4 = 2: low limb of up[0]*v0
    99  	mov	(up), %rax	C reload up[0] for the v1 multiply at lo2
   100  	mov	%rdx, w3	C high limb of up[0]*v0
   101  	xor	R32(w0), R32(w0)	C clear the carry accumulator
   102  	lea	-16(up), up	C bias up so that 24(up) at lo2 is up[1]
   103  	lea	-24(rp), rp	C bias rp so that 24(rp) at lo2 is the first result limb
   104  	jmp	L(lo2)
   105  
   106  L(bx1):	test	$2, R8(n)	C bit 1 of n
   107  	jnz	L(b11)
   108  
   109  L(b01):	mov	%rax, w3	C n mod 4 = 1: low limb of up[0]*v0
   110  	mov	%rdx, w0	C high limb of up[0]*v0
   111  	mov	(up), %rax	C reload up[0] for the v1 multiply at lo1
   112  	xor	R32(w1), R32(w1)	C clear the carry accumulator; overwrites vp, already consumed
   113  	lea	8(up), up	C advance up so that (up) at lo1 is up[1]
   114  	dec	n		C n mod 4 was 1, so the count is now a multiple of 4
   115  	jmp	L(lo1)
   116  
   117  L(b11):	mov	%rax, w1	C n mod 4 = 3: low limb of up[0]*v0; overwrites vp
   118  	mov	(up), %rax	C reload up[0] for the v1 multiply at lo3
   119  	mov	%rdx, w2	C high limb of up[0]*v0
   120  	xor	R32(w3), R32(w3)	C clear the carry accumulator
   121  	lea	-8(up), up	C bias up so that 16(up) at lo3 is up[1]
   122  	lea	-16(rp), rp	C bias rp so that 16(rp) at lo3 is the first result limb
   123  	jmp	L(lo3)
   124  
        C Main loop, unrolled four ways.  Every quarter follows the same pattern:
        C   1. multiply the limb in rax by v1,
        C   2. add the finished accumulator into the rp limb for this position,
        C   3. fold the v1 product into the next two accumulators,
        C   4. load the next up limb, multiply it by v0 and fold that in as well,
        C   5. record the carry of step 4 in a third, freshly zeroed accumulator,
        C   6. reload the same up limb, since mul overwrote rax.
        C The accumulator roles rotate by one register per quarter:
        C   lo1 retires w3, lo0 retires w0, lo3 retires w1, lo2 retires w2.
   125  	ALIGN(16)
   126  L(top):
   127  L(lo1):	mul	v1		C rdx:rax = current limb * v1
   128  	add	w3, (rp)	C retire w3 into the result
   129  	mov	$0, R32(w2)	C zero w2 with mov: xor would destroy the carry needed by adc
   130  	adc	%rax, w0
   131  	mov	(up), %rax	C next up limb
   132  	adc	%rdx, w1
   133  	mul	v0		C rdx:rax = next limb * v0
   134  	add	%rax, w0
   135  	mov	(up), %rax	C reload the same limb for the v1 multiply at lo0
   136  	adc	%rdx, w1
   137  	adc	$0, R32(w2)	C w2 = carry out of w1
   138  L(lo0):	mul	v1		C same pattern, retiring w0 at offset 8
   139  	add	w0, 8(rp)
   140  	adc	%rax, w1
   141  	mov	8(up), %rax
   142  	mov	$0, R32(w3)	C mov keeps the carry flag intact
   143  	adc	%rdx, w2
   144  	mul	v0
   145  	add	%rax, w1
   146  	mov	8(up), %rax
   147  	adc	%rdx, w2
   148  	adc	$0, R32(w3)	C w3 = carry out of w2
   149  L(lo3):	mul	v1		C same pattern, retiring w1 at offset 16
   150  	add	w1, 16(rp)
   151  	adc	%rax, w2
   152  	mov	16(up), %rax
   153  	mov	$0, R32(w0)	C mov keeps the carry flag intact
   154  	adc	%rdx, w3
   155  	mul	v0
   156  	add	%rax, w2
   157  	mov	16(up), %rax
   158  	adc	%rdx, w3
   159  	adc	$0, R32(w0)	C w0 = carry out of w3
   160  L(lo2):	mul	v1		C same pattern, retiring w2 at offset 24
   161  	add	w2, 24(rp)
   162  	adc	%rax, w3
   163  	mov	24(up), %rax
   164  	adc	%rdx, w0
   165  	mov	$0, R32(w1)	C mov keeps the flags intact
   166  	lea	32(rp), rp	C advance rp by four limbs; lea leaves the flags alone
   167  	mul	v0
   168  	lea	32(up), up	C advance up by four limbs; lea leaves the flags alone
   169  	add	%rax, w3
   170  	adc	%rdx, w0
   171  	mov	-8(up), %rax	C reload the limb just multiplied by v0 (24(up) before the lea)
   172  	adc	$0, R32(w1)	C w1 = carry out of w0
   173  	sub	$4, n		C four limbs consumed
   174  	ja	L(top)		C loop while the count stays above zero; zero or a borrow exits
   175  
   176  L(end):	mul	v1		C rdx:rax = limb reloaded at the loop bottom * v1
   177  	add	w3, (rp)	C retire w3 into the result
   178  	adc	%rax, w0
   179  	adc	%rdx, w1
   180  	mov	w0, 8(rp)	C the following limb is stored, not added
   181  	mov	w1, %rax	C return the most significant limb
   182  	pop	%rbp		C restore in reverse order of the pushes
   183  	pop	%rbx
   184  	FUNC_EXIT()		C macro defined elsewhere; presumably undoes FUNC_ENTRY
   185  	ret
   186  EPILOGUE()