github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/mul_2.asm (about)

     1  dnl  AMD64 mpn_mul_2 optimised for Intel Atom.
     2  
     3  dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb	best
    34  C AMD K8,K9
    35  C AMD K10
    36  C AMD bd1
    37  C AMD bd2
    38  C AMD bobcat
    39  C AMD jaguar
    40  C Intel P4
    41  C Intel PNR
    42  C Intel NHM
    43  C Intel SBR
    44  C Intel IBR
    45  C Intel HWL
    46  C Intel BWL
    47  C Intel atom	17.75		this
    48  C VIA nano
    49  
    50  C The loop of this code is the result of running a code generation and
    51  C optimisation tool suite written by David Harvey and Torbjorn Granlund.
    52  
    53  define(`rp',      `%rdi')   C rcx
    54  define(`up',      `%rsi')   C rdx
    55  define(`n_param', `%rdx')   C r8
    56  define(`vp',      `%rcx')   C r9
        C The four names above are the argument registers the code reads on
        C entry: rp is stored through, up and vp are loaded through, and
        C n_param is copied into n.  The register named in each trailing comment
        C is presumably where the DOS64 ABI passes the same argument, with
        C FUNC_ENTRY(4) moving it into place; FUNC_ENTRY is defined outside this
        C file, so confirm there.
    57  
        C v0 and v1 hold the two multiplier limbs loaded from (vp) and 8(vp).
        C w0..w3 are the accumulator registers of the multiply loop.
        C   - w1 shares %rcx with vp, so vp must not be read once w1 has been
        C     written; the code reads vp only while loading v0 and v1.
        C   - w0 (%rbx) and w2 (%rbp) are the two registers the function pushes
        C     on entry and pops before returning.
        C n is the working limb count.  It is a copy of n_param because mul
        C writes its high result half to %rdx, which is where n_param lives.
    58  define(`v0', `%r8')
    59  define(`v1', `%r9')
    60  define(`w0', `%rbx')
    61  define(`w1', `%rcx')
    62  define(`w2', `%rbp')
    63  define(`w3', `%r10')
    64  define(`n',  `%r11')
    65  
        C NOTE(review): ABI_SUPPORT is defined outside this file; presumably it
        C declares that this code may be built for both the DOS64 and the STD64
        C calling conventions -- confirm against the macro definition.
    66  ABI_SUPPORT(DOS64)
    67  ABI_SUPPORT(STD64)
    68  
    69  ASM_START()
    70  	TEXT
    71  	ALIGN(16)
        C mpn_mul_2 -- multiply the n-limb operand at up by the two limbs at vp.
        C
        C In:   rp       destination; limbs rp[0] .. rp[n] are written
        C       up       source operand u[0] .. u[n-1], 8 bytes per limb
        C       n_param  limb count n
        C       vp       two-limb multiplier; vp[0] is kept in v0, vp[1] in v1
        C Out:  %rax     most significant limb of the (n+2)-limb product
        C
        C %rbx (w0) and %rbp (w2) are saved on the stack and restored on exit.
        C
        C Structure: u[0]*v0 is computed up front, then n mod 4 selects one of
        C four entry points (lo0, lo1, lo2, lo3) into a 4-way unrolled loop.
        C Each entry stub biases up and rp so that the fixed displacements used
        C inside the loop address the correct limbs for that entry.
        C
        C Each quarter of the loop adds u[i]*v1 and u[i+1]*v0 into a window of
        C three of the registers w0..w3 (low, middle, carry), stores the limb
        C completed by the previous quarter, and zeroes the register that
        C becomes the new carry word.  The window rotates by one register per
        C quarter.  The zeroing inside the loop uses mov $0 rather than xor
        C because it sits between an add/adc pair and mov leaves CF untouched;
        C the lea instructions inside carry chains are safe for the same reason.
        C
        C NOTE(review): with n == 1 the L(b01) path decrements n to 0 and then
        C runs one complete pass of the loop, which reads and writes beyond a
        C one-limb operand.  Presumably callers guarantee n >= 2 -- confirm.
    72  PROLOGUE(mpn_mul_2)
    73  	FUNC_ENTRY(4)			C defined elsewhere; presumably DOS64 argument setup
    74  	push	%rbx			C w0
    75  	push	%rbp			C w2
    76  
    77  	mov	(up), %rax		C rax = u[0]
    78  
    79  	mov	(vp), v0
    80  	mov	8(vp), v1		C last read through vp; %rcx is reused as w1 below
    81  
    82  	mov	n_param, n		C must precede mul, which overwrites %rdx
    83  	mul	v0			C rdx:rax = u[0] * v0
    84  
    85  	test	$1, R8(n)		C bit 0 of n
    86  	jnz	L(bx1)
    87  
    88  L(bx0):	test	$2, R8(n)		C bit 1 of n
    89  	jnz	L(b10)
    90  
        C n mod 4 == 0: enter at lo0 with window (w0, w1, w2).
    91  L(b00):	mov	%rax, w0
    92  	mov	(up), %rax		C reload u[0] for the v1 multiply
    93  	mov	%rdx, w1
    94  	xor	R32(w2), R32(w2)
    95  	lea	-8(rp), rp		C lo0 stores at 8(rp), which is now rp[0]
    96  	jmp	L(lo0)
    97  
        C n mod 4 == 2: enter at lo2 with window (w2, w3, w0).
    98  L(b10):	mov	%rax, w2
    99  	mov	(up), %rax		C reload u[0] for the v1 multiply
   100  	mov	%rdx, w3
   101  	xor	R32(w0), R32(w0)	C also clears CF, so the adc $0 at lo2 adds nothing
   102  	lea	-16(up), up		C lo2 loads 24(up), which is now u[1]
   103  	lea	-24(rp), rp		C lo2 stores at 24(rp), which is now rp[0]
   104  	jmp	L(lo2)
   105  
   106  L(bx1):	test	$2, R8(n)		C bit 1 of n
   107  	jnz	L(b11)
   108  
        C n mod 4 == 1: enter at lo1 with window (w3, w0, w1).
   109  L(b01):	mov	%rax, w3
   110  	mov	%rdx, w0
   111  	mov	(up), %rax		C reload u[0] for the v1 multiply
   112  	xor	R32(w1), R32(w1)
   113  	lea	8(up), up		C lo1 loads (up), which is now u[1]
   114  	dec	n			C n becomes a multiple of 4 for the sub/ja test
   115  	jmp	L(lo1)
   116  
        C n mod 4 == 3: enter at lo3 with window (w1, w2, w3).
   117  L(b11):	mov	%rax, w1
   118  	mov	(up), %rax		C reload u[0] for the v1 multiply
   119  	mov	%rdx, w2
   120  	xor	R32(w3), R32(w3)
   121  	lea	-8(up), up		C lo3 loads 16(up), which is now u[1]
   122  	lea	-16(rp), rp		C lo3 stores at 16(rp), which is now rp[0]
   123  	jmp	L(lo3)
   124  
   125  	ALIGN(16)
   126  L(top):
        C Quarter 1: store w3, accumulate into (w0, w1), carry into w2.
   127  L(lo1):	mul	v1			C rax holds the current u limb on entry
   128  	add	%rax, w0
   129  	mov	(up), %rax		C next u limb
   130  	mov	$0, R32(w2)		C mov, not xor: CF from the add is still live
   131  	mov	w3, (rp)		C limb completed by the previous quarter
   132  	adc	%rdx, w1
   133  	mul	v0
   134  	add	%rax, w0
   135  	mov	(up), %rax		C same limb again; mul overwrote rax
   136  	adc	%rdx, w1
   137  	adc	$0, R32(w2)		C carry out of the middle word
        C Quarter 2: store w0, accumulate into (w1, w2), carry into w3.
   138  L(lo0):	mul	v1
   139  	add	%rax, w1
   140  	mov	8(up), %rax		C next u limb
   141  	mov	w0, 8(rp)
   142  	adc	%rdx, w2
   143  	mul	v0
   144  	add	%rax, w1
   145  	mov	8(up), %rax		C same limb again; mul overwrote rax
   146  	adc	%rdx, w2
   147  	mov	$0, R32(w3)		C CF from the adc above is still live
   148  	adc	$0, R32(w3)
        C Quarter 3: store w1, accumulate into (w2, w3), carry into w0.
   149  L(lo3):	mul	v1
   150  	add	%rax, w2
   151  	mov	16(up), %rax		C next u limb
   152  	mov	w1, 16(rp)
   153  	mov	$0, R32(w0)		C CF from the add is still live
   154  	adc	%rdx, w3
   155  	mul	v0
   156  	add	%rax, w2
   157  	mov	16(up), %rax		C same limb again; mul overwrote rax
   158  	adc	%rdx, w3
        C Quarter 4: store w2, accumulate into (w3, w0), carry into w1.
        C The adc $0 below finishes the carry chain of quarter 3; when entered
        C from L(b10) CF is clear, so it adds nothing.
   159  L(lo2):	mov	$0, R32(w1)
   160  	mov	w2, 24(rp)
   161  	adc	$0, R32(w0)
   162  	mul	v1
   163  	add	%rax, w3
   164  	mov	24(up), %rax		C next u limb
   165  	lea	32(up), up		C advance by four limbs; lea keeps CF intact
   166  	adc	%rdx, w0
   167  	mul	v0
   168  	lea	32(rp), rp		C advance by four limbs
   169  	add	%rax, w3
   170  	adc	%rdx, w0
   171  	mov	-8(up), %rax		C same limb as the 24(up) load, up has advanced
   172  	adc	$0, R32(w1)
   173  	sub	$4, n
   174  	ja	L(top)			C unsigned: loop while more than 4 limbs remained
   175  
        C Tail: rax holds the last u limb; add its v1 product, store the final
        C two limbs and return the top word.
   176  L(end):	mul	v1
   177  	mov	w3, (rp)
   178  	add	%rax, w0
   179  	adc	%rdx, w1
   180  	mov	w0, 8(rp)
   181  	mov	w1, %rax		C return value: most significant limb
   182  	pop	%rbp
   183  	pop	%rbx
   184  	FUNC_EXIT()			C defined elsewhere; presumably undoes FUNC_ENTRY
   185  	ret
   186  EPILOGUE()