github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/atom/sse2/aorsmul_1.asm (about)

     1  dnl x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     4  
     5  dnl  Copyright 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C			    cycles/limb
    36  C
    37  C P5				 -
    38  C P6 model 0-8,10-12		 -
    39  C P6 model 9  (Banias)
    40  C P6 model 13 (Dothan)
    41  C P4 model 0  (Willamette)
    42  C P4 model 1  (?)
    43  C P4 model 2  (Northwood)
    44  C P4 model 3  (Prescott)
    45  C P4 model 4  (Nocona)
    46  C Intel Atom			 8
    47  C AMD K6
    48  C AMD K7			 -
    49  C AMD K8
    50  C AMD K10
    51  
    52  define(`rp', `%edi')	C pointer to the limbs updated in memory; loaded from the first stack argument
    53  define(`up', `%esi')	C pointer to the limbs that get multiplied; loaded from the second stack argument
    54  define(`n',  `%ecx')	C limb count; loaded from the third stack argument
    55  
    56  ifdef(`OPERATION_addmul_1',`
    57  	define(ADDSUB,  add)	C memory update instruction: add when building addmul_1
    58  	define(func_1,  mpn_addmul_1)
    59  	define(func_1c, mpn_addmul_1c)')
    60  ifdef(`OPERATION_submul_1',`
    61  	define(ADDSUB,  sub)	C memory update instruction: sub when building submul_1
    62  	define(func_1,  mpn_submul_1)
    63  	define(func_1c, mpn_submul_1c)')
    64  
    65  MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
    66  
    67  	TEXT
    68  	ALIGN(16)
    69  PROLOGUE(func_1)
    70  	xor	%edx, %edx		C plain entry: carry-in limb edx = 0. Routine multiplies n limbs at up by the 4th argument and ADDSUBs the result into rp
    71  L(ent):	push	%edi			C shared entry: func_1c jumps here with its carry-in already in edx
    72  	push	%esi
    73  	push	%ebx			C edi esi ebx are saved here and restored by the pops before ret
    74  	mov	16(%esp), rp		C after three pushes the first argument sits at 16(esp)
    75  	mov	20(%esp), up		C second argument
    76  	mov	24(%esp), n		C third argument
    77  	movd	28(%esp), %mm7		C fourth argument: multiplier limb, kept in mm7 throughout
    78  	test	$1, n			C dispatch on n mod 4 into the 4-way unrolled loop
    79  	jz	L(fi0or2)		C n even
    80  	movd	(up), %mm0		C n odd: first source limb
    81  	pmuludq	%mm7, %mm0		C mm0 = 64-bit product of first limb and multiplier
    82  	shr	$2, n			C n = n / 4; CF = bit 1 of old n; ZF set when the quotient is 0
    83  	jnc	L(fi1)			C bit 1 clear: n mod 4 = 1
    84  
    85  L(fi3):	lea	-8(up), up		C n mod 4 = 3: bias up and rp to match the offsets used from lo3 on
    86  	lea	-8(rp), rp
    87  	movd	12(up), %mm1		C second source limb (12 - 8 = offset 4 from the original up)
    88  	movd	%mm0, %ebx		C low half of first product
    89  	pmuludq	%mm7, %mm1
    90  	add	$1, n			C increment and clear carry (CF was 1 from the shr)
    91  	jmp	L(lo3)
    92  
    93  L(fi1):	movd	%mm0, %ebx		C n mod 4 = 1: low half of first product
    94  	jz	L(wd1)			C ZF is still from the shr: n was 1, go straight to the final limb
    95  	movd	4(up), %mm1		C second source limb
    96  	pmuludq	%mm7, %mm1
    97  	jmp	L(lo1)			C CF = 0 here (jnc was taken; movd pmuludq jz do not change flags)
    98  
    99  L(fi0or2):
   100  	movd	(up), %mm1		C n even: first source limb
   101  	pmuludq	%mm7, %mm1		C mm1 = 64-bit product of first limb and multiplier
   102  	shr	$2, n			C n = n / 4; CF = bit 1 of old n
   103  	movd	4(up), %mm0		C second source limb; its pmuludq happens later on either path
   104  	jc	L(fi2)			C bit 1 set: n mod 4 = 2
   105  	lea	-4(up), up		C n mod 4 = 0: bias up and rp to match the offsets used from lo0 on
   106  	lea	-4(rp), rp
   107  	movd	%mm1, %eax		C low half of first product
   108  	pmuludq	%mm7, %mm0
   109  	jmp	L(lo0)			C CF = 0 here (jc not taken; lea movd pmuludq do not change flags)
   110  
   111  L(fi2):	lea	4(up), up		C n mod 4 = 2: bias up and rp to match the offsets used from lo2 on
   112  	add	$1, n			C increment and clear carry (CF was 1 from the shr)
   113  	movd	%mm1, %eax		C low half of first product
   114  	lea	-12(rp), rp
   115  	jmp	L(lo2)			C mm0 still holds the raw second limb; lo2 multiplies it
   116  
   117  C	ALIGN(16)			C alignment seems irrelevant
   118  L(top):	movd	4(up), %mm1		C main loop: four limbs per pass; mm0/mm1 alternate as products, ebx/eax as low words
   119  	adc	$0, %edx		C fold CF from the adc at lo2 into the carry limb (dec leaves CF alone)
   120  	ADDSUB	%eax, 12(rp)		C update limb in memory; CF = carry or borrow out, consumed by the adc at lo1
   121  	movd	%mm0, %ebx		C low half of next product
   122  	pmuludq	%mm7, %mm1
   123  	lea	16(rp), rp		C advance rp by four limbs; lea does not touch flags
   124  L(lo1):	psrlq	$32, %mm0		C move high half of product into the low dword
   125  	adc	%edx, %ebx		C ebx = low half + carry limb + CF
   126  	movd	%mm0, %edx		C edx = high half: becomes the next carry limb
   127  	movd	%mm1, %eax		C low half of next product
   128  	movd	8(up), %mm0
   129  	pmuludq	%mm7, %mm0
   130  	adc	$0, %edx		C add the carry out of the adc above
   131  	ADDSUB	%ebx, (rp)		C update limb in memory; CF feeds the adc at lo0
   132  L(lo0):	psrlq	$32, %mm1
   133  	adc	%edx, %eax		C eax = low half + carry limb + CF
   134  	movd	%mm1, %edx		C edx = high half: becomes the next carry limb
   135  	movd	%mm0, %ebx		C low half of next product
   136  	movd	12(up), %mm1
   137  	pmuludq	%mm7, %mm1
   138  	adc	$0, %edx		C add the carry out of the adc above
   139  	ADDSUB	%eax, 4(rp)		C update limb in memory; CF feeds the adc at lo3
   140  L(lo3):	psrlq	$32, %mm0
   141  	adc	%edx, %ebx		C ebx = low half + carry limb + CF
   142  	movd	%mm0, %edx		C edx = high half: becomes the next carry limb
   143  	movd	%mm1, %eax		C low half of next product
   144  	lea	16(up), up		C advance up by four limbs; lea does not touch flags
   145  	movd	(up), %mm0		C raw source limb; multiplied at lo2
   146  	adc	$0, %edx		C add the carry out of the adc above
   147  	ADDSUB	%ebx, 8(rp)		C update limb in memory; CF feeds the adc at lo2
   148  L(lo2):	psrlq	$32, %mm1
   149  	adc	%edx, %eax		C eax = low half + carry limb + CF
   150  	movd	%mm1, %edx		C edx = high half: becomes the next carry limb
   151  	pmuludq	%mm7, %mm0
   152  	dec	n			C one pass done; dec does not modify CF so the carry chain survives
   153  	jnz	L(top)
   154  
   155  L(end):	adc	n, %edx			C n is zero here, so this only adds CF into the carry limb
   156  	ADDSUB	%eax, 12(rp)		C second to last limb
   157  	movd	%mm0, %ebx		C low half of last product
   158  	lea	16(rp), rp
   159  L(wd1):	psrlq	$32, %mm0		C final limb (also reached directly from fi1 when n was 1)
   160  	adc	%edx, %ebx		C ebx = low half + carry limb + CF
   161  	movd	%mm0, %eax		C eax = high half of last product
   162  	adc	n, %eax			C n is zero: add the carry out of the adc above
   163  	ADDSUB	%ebx, (rp)		C last limb update; CF = final carry or borrow
   164  	emms				C reset MMX state before returning; does not change flags
   165  	adc	n, %eax			C n is zero: add the final carry or borrow; eax is left as the result
   166  	pop	%ebx
   167  	pop	%esi
   168  	pop	%edi
   169  	ret
   170  EPILOGUE()
   171  PROLOGUE(func_1c)
   172  	mov	20(%esp), %edx		C carry-in limb: fifth stack argument, read before any register is pushed
   173  	jmp	L(ent)			C join func_1 just past its xor so edx keeps the supplied carry
   174  EPILOGUE()