github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/addmul_1.asm

dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C			    cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		5.24
C P6 model 13  (Dothan)		5.24
C P4 model 0-1 (Willamette)	5
C P4 model 2   (Northwood)	5
C P4 model 3-4 (Prescott)	5

C TODO:
C  * Tweak eax/edx offsets in the loop so as to save some lea's
C  * Perhaps software pipeline the small-case code

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16

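C The routine computes {rp,n} += {up,n} * v0 and returns the carry limb.
C As a rough sketch of that behaviour (not part of the original file;
C assuming the usual 32-bit GMP limb types here, with a 64-bit temporary
C standing in for the pmuludq result):
C
C	mp_limb_t
C	mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
C	{
C	  mp_limb_t cy = 0;
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    {
C	      uint64_t t = (uint64_t) up[i] * v0 + rp[i] + cy;
C	      rp[i] = (mp_limb_t) t;		/* low 32 bits back to rp */
C	      cy = (mp_limb_t) (t >> 32);	/* high 32 bits carry on */
C	    }
C	  return cy;
C	}
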
	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
	pxor	%mm6, %mm6		C carry accumulator = 0
L(ent):	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C n
	movd	16(%esp), %mm7		C v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4: take the unrolled path

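C Small-operand path (n < 4): one limb per iteration.  pmuludq forms the
C 32x32->64 product, the rp limb and the running carry in %mm6 are added
C in, the low half is stored back and the high half is shifted down to
C become the next carry.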
L(lp0):	movd	(%eax), %mm0
	lea	4(%eax), %eax
	movd	(%edx), %mm4
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0
	paddq	%mm0, %mm4
	paddq	%mm4, %mm6
	movd	%mm6, -4(%edx)
	psrlq	$32, %mm6
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax
	emms
	ret

L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	je	L(2)
	jmp	L(3)			C FIXME: one case should fall through

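C Feed-in code for the 4-way unrolled loop.  Each of L(0)..L(3) handles one
C value of n mod 4: it starts the first couple of products, biases %eax and
C %edx so the fixed displacements in the loop body line up, and sets %ecx
C to a negated loop count (a multiple of 4) so the main loop can simply
C count up to zero.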
L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	movd	12(%edx), %mm5
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	movd	8(%edx), %mm4
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	jmp	L(01)

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	jmp	L(10)

L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	paddq	%mm0, %mm4
	movd	4(%edx), %mm5

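C Main loop: four limbs per iteration, software pipelined.  Each quarter of
C the body starts one 32x32->64 multiply with pmuludq, adds a pending
C product to its rp limb, folds an earlier such sum into the carry
C accumulator %mm6, loads ahead from up and rp, stores the low 32 bits of
C %mm6 as one result limb, and shifts %mm6 right by 32 so only the carry
C remains.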
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm5, %mm6
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm4, %mm6
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm5, %mm6
	movd	28(%eax), %mm2
	paddq	%mm0, %mm4
	movd	20(%edx), %mm5
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx
	jnz	L(top)

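C Wind-down: finish the products still in flight when the loop falls
C through, add in the last rp limbs, propagate the carry, store the final
C three result limbs and return the carry limb in %eax.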
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm5, %mm6
	paddq	%mm2, %mm4
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm4, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax
	emms
	ret
EPILOGUE()
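
C mpn_addmul_1c is the carry-in variant: a fifth argument at sp + 20 seeds
C the carry accumulator %mm6 before jumping to the common path at L(ent).
C In terms of the C sketch above, cy simply starts at that argument instead
C of at 0.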
PROLOGUE(mpn_addmul_1c)
	movd	20(%esp), %mm6
	jmp	L(ent)
EPILOGUE()