github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mul_1.asm (about)

     1  dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
     2  
     3  dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  include(`../config.m4')
    33  
    34  C                           cycles/limb
    35  C P6 model 0-8,10-12		-
    36  C P6 model 9   (Banias)		4.17
    37  C P6 model 13  (Dothan)		4.17
    38  C P4 model 0-1 (Willamette)	4
    39  C P4 model 2   (Northwood)	4
    40  C P4 model 3-4 (Prescott)	4.55
    41  
    42  C TODO:
    43  C  * Tweak eax/edx offsets in loop so as to save some lea instructions
    44  C  * Perhaps software pipeline small-case code
    45  
    46  C INPUT PARAMETERS
    47  C rp		sp + 4		destination; one 32-bit limb is stored per source limb
    48  C up		sp + 8		source limbs, each multiplied by v0
    49  C n		sp + 12		limb count; n = 0 is not handled (first count test comes after one limb)
    50  C v0		sp + 16		32-bit multiplier
    51  C Returns in eax the carry left over after the last limb has been stored.
    52  	TEXT
    53  	ALIGN(16)
    54  PROLOGUE(mpn_mul_1)
    55  	pxor	%mm6, %mm6		C mm6 = carry accumulator, cleared for mpn_mul_1
    56  L(ent):	mov	4(%esp), %edx		C edx = rp; mpn_mul_1c enters here with mm6 preset
    57  	mov	8(%esp), %eax		C eax = up
    58  	mov	12(%esp), %ecx		C ecx = n
    59  	movd	16(%esp), %mm7		C mm7 = v0
    60  	cmp	$4, %ecx
    61  	jnc	L(big)			C n >= 4 (unsigned): take the unrolled path
    62  
    63  L(lp0):	movd	(%eax), %mm0		C n < 4: simple loop, one limb per pass
    64  	lea	4(%eax), %eax		C advance up
    65  	lea	4(%edx), %edx		C advance rp, hence the -4 offset in the store below
    66  	pmuludq	%mm7, %mm0		C mm0 = 64-bit product limb * v0
    67  	paddq	%mm0, %mm6		C add the product to the incoming carry
    68  	movd	%mm6, -4(%edx)		C low 32 bits are the result limb
    69  	psrlq	$32, %mm6		C high 32 bits become the next carry
    70  	dec	%ecx
    71  	jnz	L(lp0)
    72  	movd	%mm6, %eax		C return the final carry
    73  	emms				C reset MMX state before returning
    74  	ret
    75  
    76  L(big):	and	$3, %ecx		C ecx = n mod 4, selects the loop entry point
    77  	je	L(0)			C n mod 4 = 0
    78  	cmp	$2, %ecx
    79  	jc	L(1)			C n mod 4 = 1
    80  	je	L(2)			C n mod 4 = 2
    81  	jmp	L(3)			C FIXME: one case should fall through
    82  
    83  L(0):	movd	(%eax), %mm3		C mm3 = first source limb
    84  	sub	12(%esp), %ecx		C loop count: ecx = (n mod 4) - n, a negative multiple of 4
    85  	lea	-16(%eax), %eax		C bias up so that 20 and 24(eax) below are the 2nd and 3rd limbs
    86  	lea	-12(%edx), %edx		C bias rp so that the store 12(edx) at L(00) hits rp[0]
    87  	pmuludq	%mm7, %mm3
    88  	movd	20(%eax), %mm0
    89  	pmuludq	%mm7, %mm0
    90  	movd	24(%eax), %mm1
    91  	jmp	L(00)
    92  
    93  L(1):	movd	(%eax), %mm2		C mm2 = first source limb
    94  	sub	12(%esp), %ecx		C loop count, as in the L(0) case
    95  	lea	-12(%eax), %eax		C bias up so that 16 and 20(eax) below are the 2nd and 3rd limbs
    96  	lea	-8(%edx), %edx		C bias rp so that the store 8(edx) at L(01) hits rp[0]
    97  	pmuludq	%mm7, %mm2
    98  	movd	16(%eax), %mm3
    99  	pmuludq	%mm7, %mm3
   100  	movd	20(%eax), %mm0
   101  	jmp	L(01)
   102  
   103  L(2):	movd	(%eax), %mm1		C mm1 = first source limb
   104  	sub	12(%esp), %ecx		C loop count, as in the L(0) case
   105  	lea	-8(%eax), %eax		C bias up so that 12 and 16(eax) below are the 2nd and 3rd limbs
   106  	lea	-4(%edx), %edx		C bias rp so that the store 4(edx) at L(10) hits rp[0]
   107  	pmuludq	%mm7, %mm1
   108  	movd	12(%eax), %mm2
   109  	pmuludq	%mm7, %mm2
   110  	movd	16(%eax), %mm3
   111  	jmp	L(10)
   112  
   113  L(3):	movd	(%eax), %mm0		C mm0 = first source limb
   114  	sub	12(%esp), %ecx		C loop count, as in the L(0) case
   115  	lea	-4(%eax), %eax		C bias up; rp needs no bias since L(top) stores at 0(edx)
   116  	pmuludq	%mm7, %mm0
   117  	movd	8(%eax), %mm1
   118  	pmuludq	%mm7, %mm1
   119  	movd	12(%eax), %mm2		C falls through into L(top)
   120  C Main loop, four limbs per pass.  L(top), L(10), L(01), L(00) are entered for n mod 4 = 3, 2, 1, 0.
   121  	ALIGN(16)
   122  L(top):	pmuludq	%mm7, %mm2		C multiply the limb loaded in the previous group
   123  	paddq	%mm0, %mm6		C add an already formed product to the carry
   124  	movd	16(%eax), %mm3		C load a limb to be multiplied in the next group
   125  	movd	%mm6, 0(%edx)		C low 32 bits are the result limb
   126  	psrlq	$32, %mm6		C high 32 bits become the next carry
   127  L(10):	pmuludq	%mm7, %mm3		C same pattern, registers rotated by one
   128  	paddq	%mm1, %mm6
   129  	movd	20(%eax), %mm0
   130  	movd	%mm6, 4(%edx)
   131  	psrlq	$32, %mm6
   132  L(01):	pmuludq	%mm7, %mm0
   133  	paddq	%mm2, %mm6
   134  	movd	24(%eax), %mm1
   135  	movd	%mm6, 8(%edx)
   136  	psrlq	$32, %mm6
   137  L(00):	pmuludq	%mm7, %mm1
   138  	paddq	%mm3, %mm6
   139  	movd	28(%eax), %mm2
   140  	movd	%mm6, 12(%edx)
   141  	psrlq	$32, %mm6
   142  	lea	16(%eax), %eax		C advance up by four limbs
   143  	lea	16(%edx), %edx		C advance rp by four limbs
   144  	add	$4, %ecx		C count up toward zero
   145  	ja	L(top)			C taken while the sum is nonzero with no carry out, i.e. still negative
   146  C Drain the last three limbs: mm0 and mm1 hold finished products, mm2 a limb not yet multiplied.
   147  L(end):	pmuludq	%mm7, %mm2
   148  	paddq	%mm0, %mm6
   149  	movd	%mm6, 0(%edx)
   150  	psrlq	$32, %mm6
   151  	paddq	%mm1, %mm6
   152  	movd	%mm6, 4(%edx)
   153  	psrlq	$32, %mm6
   154  	paddq	%mm2, %mm6
   155  	movd	%mm6, 8(%edx)
   156  	psrlq	$32, %mm6
   157  	movd	%mm6, %eax		C return the final carry
   158  	emms				C reset MMX state before returning
   159  	ret
   160  EPILOGUE()
   161  PROLOGUE(mpn_mul_1c)
   162  	movd	20(%esp), %mm6		C mm6 = extra argument at sp + 20, used as the initial carry
   163  	jmp	L(ent)			C join mpn_mul_1 just after its pxor, so mm6 is not cleared
   164  EPILOGUE()