github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/mulx/aorsmul_1.asm (about)

     1  dnl  AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.
     2  
     3  dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C AMD K8,K9	 -
    35  C AMD K10	 -
    36  C AMD bd1	 -
    37  C AMD bd2	 ?
    38  C AMD bobcat	 -
    39  C AMD jaguar	 ?
    40  C Intel P4	 -
    41  C Intel PNR	 -
    42  C Intel NHM	 -
    43  C Intel SBR	 -
    44  C Intel HWL	 ?
    45  C Intel BWL	 ?
    46  C Intel atom	 -
    47  C VIA nano	 -
    48  
    49  define(`rp',      `%rdi')   C arg 1: limbs that are read and updated in place (Win64 arg register: rcx)
    50  define(`up',      `%rsi')   C arg 2: source limbs fed to mulx (Win64 arg register: rdx)
    51  define(`n_param', `%rdx')   C arg 3: limb count as passed in (Win64 arg register: r8)
    52  define(`v0_param',`%rcx')   C arg 4: multiplier limb as passed in (Win64 arg register: r9)
    53  
    54  define(`n',       `%rcx')   C limb count after the xchg in the prologue; negated and used as an index
    55  define(`v0',      `%rdx')   C multiplier after the xchg; rdx is the implicit source operand of mulx
    56  
    57  ifdef(`OPERATION_addmul_1',`
    58        define(`ADDSUB',        `add')
    59        define(`ADCSBB',        `adc')
    60        define(`func',  `mpn_addmul_1')
    61  ')		C addmul build: products are added into rp (ADDSUB starts the chain, ADCSBB continues it)
    62  ifdef(`OPERATION_submul_1',`
    63        define(`ADDSUB',        `sub')
    64        define(`ADCSBB',        `sbb')
    65        define(`func',  `mpn_submul_1')
    66  ')		C submul build: products are subtracted from rp; presumably exactly one OPERATION_* symbol is set per build -- confirm
    67  
    68  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
    69  
    70  IFDOS(`	define(`up', ``%rsi'')	') dnl  IFDOS is defined outside this file, presumably active only for DOS64 builds; up stays in rsi as in the default mapping
    71  IFDOS(`	define(`rp', ``%rcx'')	') dnl  rp in rcx, the first Win64 argument register
    72  IFDOS(`	define(`vl', ``%r9'')	') dnl  NOTE(review): vl is not referenced anywhere in the code below
    73  IFDOS(`	define(`r9', ``rdi'')	') dnl  rewrites the bare token r9, so %r9 in the body becomes %rdi
    74  IFDOS(`	define(`n',  ``%r8'')	') dnl  n in r8; the doubled quotes keep this r8 from being rewritten by the next line
    75  IFDOS(`	define(`r8', ``r11'')	') dnl  NOTE(review): %r8 in the body becomes %r11, which the body also uses directly; the DOS64 mapping looks incomplete -- verify before relying on it
    76  
    77  ASM_START()
    78  	TEXT
    79  	ALIGN(16)
        C func is mpn_addmul_1 or mpn_submul_1, chosen by the OPERATION_* symbol.
        C For each of the n limbs at up it forms the 128-bit product with v0 and
        C applies it (ADDSUB/ADCSBB) to rp at the same index.  The high limb left
        C over in rax, including the final carry/borrow, is the return value.
        C
        C up[0] is loaded before n is examined, so n >= 1 is presumably a
        C precondition -- confirm against the mpn interface documentation.
        C
        C Register roles after the set-up code (default ABI mapping):
        C   rp (rdi)  rp + 8*n - 32, so one unrolled pass stores at 0..24(rp,n,8)
        C   up (rsi)  up + 8*n
        C   n  (rcx)  negated limb count, stepped up towards zero
        C   v0 (rdx)  multiplier, the implicit source operand of mulx
        C   r9:r8, r11:r10, r13:r12, rbx:rax  low:high halves of up to four
        C             consecutive products
        C rbx, r12 and r13 are saved on the stack and restored before ret.
        C
        C mulx does not write the flags, which lets the carry/borrow chain run
        C unbroken through the interleaved multiplies.
    80  PROLOGUE(func)
    81  	mov	(up), %r8		C r8 = up[0], fetched before up is rebased
    82  
    83  	push	%rbx
    84  	push	%r12
    85  	push	%r13
    86  
    87  	lea	(up,n_param,8), up		C up += 8*n
    88  	lea	-32(rp,n_param,8), rp		C rp += 8*n - 32
    89  	mov	R32(n_param), R32(%rax)		C copy of n; only n mod 4 is used below
    90  	xchg	v0_param, v0		C now rdx = v0 and rcx = n.  FIXME: is this insn fast?
    91  
    92  	neg	n		C n = -n, counts up towards zero
    93  
    94  	and	$3, R8(%rax)		C n mod 4 selects the entry point
    95  	jz	L(b0)
    96  	cmp	$2, R8(%rax)
    97  	jz	L(b2)
    98  	jg	L(b3)		C n mod 4 == 3; fall through for n mod 4 == 1
    99  
        C n mod 4 == 1: one product up front; n == 1 finishes at L(wd1).
   100  L(b1):	mulx	%r8, %rbx, %rax
   101  	sub	$-1, n		C n += 1; a zero result leaves CF clear for L(wd1) (add $1 would set it)
   102  	jz	L(wd1)
   103  	mulx	(up,n,8), %r9, %r8
   104  	mulx	8(up,n,8), %r11, %r10
   105  	test	R32(%rax), R32(%rax)		C clear cy before joining the carry chain
   106  	jmp	L(lo1)
   107  
        C n mod 4 == 0: three products up front; the fourth is issued at L(lo0).
   108  L(b0):	mulx	%r8, %r9, %r8
   109  	mulx	8(up,n,8), %r11, %r10
   110  	mulx	16(up,n,8), %r13, %r12
   111  	xor	R32(%rax), R32(%rax)		C clears CF for the adc chain at L(lo0); rax is overwritten there
   112  	jmp	L(lo0)
   113  
        C n mod 4 == 3: three products, pre-summed; n == 3 finishes at L(wd3).
   114  L(b3):	mulx	%r8, %r11, %r10
   115  	mulx	8(up,n,8), %r13, %r12
   116  	mulx	16(up,n,8), %rbx, %rax
   117  	add	%r10, %r13		C fold each high half into the next low half
   118  	adc	%r12, %rbx
   119  	adc	$0, %rax
   120  	sub	$-3, n		C n += 3; a zero result leaves CF clear for L(wd3)
   121  	jz	L(wd3)
   122  	test	R32(%rax), R32(%rax)		C clear cy before joining the carry chain
   123  	jmp	L(lo3)
   124  
        C n mod 4 == 2: two products, pre-summed; n == 2 finishes at L(wd2).
   125  L(b2):	mulx	%r8, %r13, %r12
   126  	mulx	8(up,n,8), %rbx, %rax
   127  	add	%r12, %rbx		C fold the first high half into the second low half
   128  	adc	$0, %rax
   129  	sub	$-2, n		C n += 2; a zero result leaves CF clear for L(wd2)
   130  	jz	L(wd2)
   131  	mulx	(up,n,8), %r9, %r8
   132  	test	R32(%rax), R32(%rax)		C clear cy before joining the carry chain
   133  	jmp	L(lo2)
   134  
        C Main loop, four limbs per pass.  r9, r11, r13 and rbx hold the limbs to
        C apply to rp; rax holds the high half still owed to the following limb.
        C L(lo3), L(lo2), L(lo1) and L(lo0) are the entry points used by the
        C L(bN) code above.
   135  L(top):	ADDSUB	%r9, (rp,n,8)
   136  L(lo3):	mulx	(up,n,8), %r9, %r8
   137  	ADCSBB	%r11, 8(rp,n,8)
   138  L(lo2):	mulx	8(up,n,8), %r11, %r10
   139  	ADCSBB	%r13, 16(rp,n,8)
   140  L(lo1):	mulx	16(up,n,8), %r13, %r12
   141  	ADCSBB	%rbx, 24(rp,n,8)
   142  	adc	%rax, %r9		C owed high half plus the carry/borrow out of rp go into the next limb
   143  L(lo0):	mulx	24(up,n,8), %rbx, %rax
   144  	adc	%r8, %r11		C chain each high half into the next low half
   145  	adc	%r10, %r13
   146  	adc	%r12, %rbx
   147  	adc	$0, %rax		C rax = carry limb
   148  	add	$4, n
   149  	js	L(top)		C loop while n is still negative
   150  
        C Wind-down: apply the last limbs.  L(wd3), L(wd2) and L(wd1) are also the
        C exits taken directly for n == 3, 2 and 1.
   151  L(end):	ADDSUB	%r9, (rp)
   152  L(wd3):	ADCSBB	%r11, 8(rp)
   153  L(wd2):	ADCSBB	%r13, 16(rp)
   154  L(wd1):	ADCSBB	%rbx, 24(rp)
   155  	adc	n, %rax		C n is 0 on every path here, so this adds only the final CF to the return limb
   156  	pop	%r13
   157  	pop	%r12
   158  	pop	%rbx
   159  	ret
   160  EPILOGUE()
   161  ASM_END()