github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/bd1/mul_2.asm (about)

     1  dnl  AMD64 mpn_mul_2 optimised for AMD Bulldozer.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C	     cycles/limb
    36  C AMD K8,K9
    37  C AMD K10
    38  C AMD bull	4.36		average, quite fluctuating
    39  C AMD pile	4.38		slightly fluctuating
    40  C AMD steam
    41  C AMD bobcat
    42  C AMD jaguar
    43  C Intel P4
    44  C Intel core
    45  C Intel NHM
    46  C Intel SBR
    47  C Intel IBR
    48  C Intel HWL
    49  C Intel BWL
    50  C Intel atom
    51  C VIA nano
    52  
    53  C The loop of this code is the result of running a code generation and
    54  C optimisation tool suite written by David Harvey and Torbjorn Granlund.
    55  C Scheme: genxmul --mul
    56  
    57  define(`rp',      `%rdi')   C base for result stores; rcx is presumably the DOS64 (Win64) arg register
    58  define(`up',      `%rsi')   C base for multiplicand loads; rdx likewise
    59  define(`n_param', `%rdx')   C limb count as passed in; r8 likewise
    60  define(`vp',      `%rcx')   C base of the two multiplier limbs; r9 likewise
    61  C Working registers.  n_param (%rdx) is copied to n before the first mul overwrites %rdx.
    62  define(`v0', `%r8')   C multiplier limb loaded from (vp)
    63  define(`v1', `%r9')   C multiplier limb loaded from 8(vp)
    64  define(`w0', `%rbx')   C accumulator; pushed in the prologue and popped before ret
    65  define(`w1', `%rcx')   C accumulator; same register as vp, first written after both vp loads
    66  define(`w2', `%rbp')   C accumulator; pushed in the prologue and popped before ret
    67  define(`w3', `%r10')   C accumulator
    68  define(`n',  `%r11')   C negated limb count, used as the loop index
    69  
    70  ABI_SUPPORT(DOS64)
    71  ABI_SUPPORT(STD64)
    72  C mpn_mul_2: multiply the n limbs at up by the 2 limbs at vp; store the low n+1 product limbs at rp and return the top limb in %rax.
    73  ASM_START()
    74  	TEXT
    75  	ALIGN(32)
    76  PROLOGUE(mpn_mul_2)
    77  	FUNC_ENTRY(4)	C macro from config.m4 (not visible here); presumably DOS64 argument setup -- confirm
    78  	push	%rbx	C w0 lives in %rbx
    79  	push	%rbp	C w2 lives in %rbp
    80  
    81  	mov	(up), %rax	C first limb of up, operand of the mul v0 below
    82  
    83  	mov	(vp), v0
    84  	mov	8(vp), v1	C last use of vp; %rcx is reused as w1 from here on
    85  
    86  	lea	(up,n_param,8), up	C up and rp now point just past their n-th limb,
    87  	lea	(rp,n_param,8), rp	C so the negative index n addresses limbs from the end
    88  
    89  	mov	n_param, n	C copy the count before mul overwrites %rdx
    90  	mul	v0	C %rdx:%rax = first limb of up times v0
    91  	neg	n	C n = -count, stepped up towards zero
    92  C Pick the loop entry point from the low two bits of n = -count.
    93  	test	$1, R8(n)
    94  	jnz	L(bx1)
    95  
    96  L(bx0):	test	$2, R8(n)
    97  	jnz	L(b10)
    98  C count = 0 mod 4: first product goes to w0,w1 and the loop is entered at lo0.
    99  L(b00):	mov	%rax, w0
   100  	mov	%rdx, w1
   101  	xor	R32(w2), R32(w2)
   102  	mov	(up,n,8), %rax	C first limb again, for the mul v1 at lo0
   103  	jmp	L(lo0)
   104  C count = 2 mod 4: first product goes to w2,w3 and the loop is entered at lo2.
   105  L(b10):	mov	%rax, w2
   106  	mov	%rdx, w3
   107  	mov	(up,n,8), %rax	C first limb again, for v1
   108  	xor	R32(w0), R32(w0)
   109  	mul	v1
   110  	add	$-2, n	C bias n so the offsets used from lo2 onwards line up
   111  	jmp	L(lo2)
   112  
   113  L(bx1):	test	$2, R8(n)
   114  	jz	L(b11)
   115  C count = 1 mod 4: enter at lo1.  NOTE(review): count = 1 looks unsupported (n would be 0 on loop entry and the exit carry never occurs) -- presumably callers pass count >= 2; confirm.
   116  L(b01):	mov	%rax, w3
   117  	mov	%rdx, w0
   118  	mov	(up,n,8), %rax	C first limb again, for v1
   119  	mul	v1
   120  	xor	R32(w1), R32(w1)
   121  	inc	n	C bias n so the offsets used from lo1 onwards line up
   122  	jmp	L(lo1)
   123  C count = 3 mod 4: first product goes to w1,w2 and the loop is entered at lo3.
   124  L(b11):	mov	%rax, w1
   125  	mov	%rdx, w2
   126  	mov	(up,n,8), %rax	C first limb again, for the mul v1 at lo3
   127  	xor	R32(w3), R32(w3)
   128  	dec	n	C bias n so the offsets used from lo3 onwards line up
   129  	jmp	L(lo3)
   130  C Main loop: each pass does 8 muls (4 limbs of up times v0 and v1) and stores 4 result limbs; the accumulator roles rotate through w0..w3.
   131  	ALIGN(32)
   132  L(top):	mov	-8(up,n,8), %rax	C limb at index n-1 again, now for v1
   133  	mul	v1
   134  	mov	w2, -16(rp,n,8)	C store finished result limb n-2
   135  L(lo1):	add	%rax, w0
   136  	mov	w3, -8(rp,n,8)	C store finished result limb n-1
   137  	adc	%rdx, w1
   138  	mov	(up,n,8), %rax	C limb at index n
   139  	mul	v0
   140  	mov	$0, R32(w2)	C w2 is free again after its store; it catches the next carry
   141  	add	%rax, w0
   142  	adc	%rdx, w1
   143  	adc	$0, R32(w2)	C carry out of w1
   144  	mov	(up,n,8), %rax	C limb at index n again, for v1
   145  L(lo0):	mul	v1
   146  	add	%rax, w1
   147  	adc	%rdx, w2
   148  	mov	8(up,n,8), %rax	C limb at index n+1
   149  	mul	v0
   150  	add	%rax, w1
   151  	mov	w0, (rp,n,8)	C store finished result limb n
   152  	mov	$0, R32(w3)	C mov rather than xor: CF from the add above feeds the adc below
   153  	mov	8(up,n,8), %rax	C limb at index n+1 again, for v1
   154  	adc	%rdx, w2
   155  	adc	$0, R32(w3)	C carry out of w2
   156  L(lo3):	mul	v1
   157  	add	%rax, w2
   158  	mov	16(up,n,8), %rax	C limb at index n+2
   159  	adc	%rdx, w3
   160  	mul	v0
   161  	add	%rax, w2
   162  	mov	16(up,n,8), %rax	C limb at index n+2 again, for v1
   163  	mov	$0, R32(w0)	C mov rather than xor: CF from the add above feeds the adc below
   164  	adc	%rdx, w3
   165  	adc	$0, R32(w0)	C carry out of w3
   166  	mul	v1
   167  	mov	w1, 8(rp,n,8)	C store finished result limb n+1
   168  L(lo2):	add	%rax, w3
   169  	adc	%rdx, w0
   170  	mov	24(up,n,8), %rax	C limb at index n+3
   171  	mul	v0
   172  	add	%rax, w3
   173  	adc	%rdx, w0
   174  	mov	$0, R32(w1)	C mov rather than xor: CF from the adc above feeds the adc below
   175  	adc	$0, R32(w1)	C carry out of w0
   176  	add	$4, n
   177  	jnc	L(top)	C the add carries only when n steps from -4 to 0, which ends the loop
   178  C Wind-down: n is 0 here, so the offsets are relative to the end of up and rp.
   179  L(end):	mov	-8(up,n,8), %rax	C last limb of up, still to be multiplied by v1
   180  	mul	v1
   181  	mov	w2, -16(rp,n,8)
   182  	add	%rax, w0
   183  	mov	w3, -8(rp,n,8)
   184  	adc	%rdx, w1
   185  	mov	w0, (rp,n,8)	C last limb written to rp
   186  	mov	w1, %rax	C return value: most significant product limb
   187  
   188  	pop	%rbp	C restore in reverse order of the pushes
   189  	pop	%rbx
   190  	FUNC_EXIT()	C macro from config.m4 (not visible here); presumably undoes FUNC_ENTRY -- confirm
   191  	ret
   192  EPILOGUE()