github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mul_basecase.asm (about)

     1  dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
     2  
     3  dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C TODO:
    34  C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
    35  C    scheduling could improve things by several cycles per outer iteration.
    36  C  * In code for un <= 3, try keeping accumulation operands in registers,
    37  C    without storing intermediates to rp.
    38  C  * We might want to keep 32 in a free mm register, since the register form is
    39  C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
    40  C  * Look into different loop alignment, we now expand the code about 50 bytes
    41  C    with possibly needless alignment.
    42  C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
    43  C  * Use OSP, should solve feed-in latency problems.
    44  C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
    45  C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
    46  C    so that they can share feed-in code, and changing the branch targets from
    47  C    L<n> to Lm<nn>.
    48  
    49  C                           cycles/limb
    50  C P6 model 9   (Banias)         ?
    51  C P6 model 13  (Dothan)         5.24
    52  C P6 model 14  (Yonah)          ?
    53  C P4 model 0-1 (Willamette):    5
    54  C P4 model 2   (Northwood):     4.60 at 32 limbs
    55  C P4 model 3-4 (Prescott):      4.94 at 32 limbs
    56  
    57  C INPUT PARAMETERS
    58  C rp		sp + 4
    59  C up		sp + 8
    60  C un		sp + 12
    61  C vp		sp + 16
    62  C vn		sp + 20
    63  
    64  	TEXT
    65  	ALIGN(16)
    66  PROLOGUE(mpn_mul_basecase)
        C Multiply the un-limb number at up by the vn-limb number at vp and
        C store the product at rp, using 32x32 to 64 bit pmuludq products that
        C are summed in MMX registers.  un of 1, 2 or 3 is handled by the
        C straight-line code below; larger un branches to L(big).
        C Register use in this part: edx = rp, eax = up, ecx = un, esi = vp,
        C ebx = vn, mm7 = current vp limb, mm6 = running 64-bit sum and carry.
    67  	push	%esi			C esi and ebx are popped again before each ret
    68  	push	%ebx
    69  	mov	12(%esp), %edx		C rp (entry offset 4, plus 8 for the two pushes)
    70  	mov	16(%esp), %eax		C up
    71  	mov	20(%esp), %ecx		C un
    72  	mov	24(%esp), %esi		C vp
    73  	mov	28(%esp), %ebx		C vn
    74  	movd	(%esi), %mm7		C mm7 = vp[0]
    75  L(ent):	cmp	$3, %ecx
    76  	ja	L(big)			C un > 3 uses the unrolled loops
    77  	movd	(%eax), %mm6		C mm6 = up[0]
    78  	pmuludq	%mm7, %mm6		C mm6 = up[0] * vp[0], needed by every small case
    79  	jz	L(un3)			C un = 3, tested on the flags of the cmp above
    80  	cmp	$2, %ecx
    81  	jz	L(un2)
    82  
        C un = 1: the single 64-bit product is stored as two limbs.  vn is not
        C examined on this path.
    83  L(un1):	movd	%mm6, (%edx)		C				un=1
    84  	psrlq	$32, %mm6		C				un=1
    85  	movd	%mm6, 4(%edx)		C				un=1
    86  	jmp	L(rtr)			C				un=1
    87  
        C un = 2: the first row, up[0..1] * vp[0], is stored to rp[0..2].
        C Unless the decremented vn is zero, a second row for vp[1] is then
        C added in starting at rp[1], with the final carry stored to rp[3].
    88  L(un2):	movd	4(%eax), %mm1		C				un=2
    89  	pmuludq	%mm7, %mm1		C				un=2
    90  	movd	%mm6, (%edx)		C				un=2
    91  	psrlq	$32, %mm6		C				un=2
    92  	paddq	%mm1, %mm6		C				un=2
    93  	movd	%mm6, 4(%edx)		C				un=2
    94  	psrlq	$32, %mm6		C				un=2
    95  	movd	%mm6, 8(%edx)		C				un=2
    96        dec	%ebx			C				un=2
    97        jz	L(rtr)			C				un=2
    98  	movd	4(%esi), %mm7		C				un=2
    99  	movd	(%eax), %mm6		C				un=2
   100  	pmuludq	%mm7, %mm6		C				un=2
   101  	movd	4(%eax), %mm1		C				un=2
   102  	movd	4(%edx), %mm4		C				un=2
   103  	pmuludq	%mm7, %mm1		C				un=2
   104  	movd	8(%edx), %mm5		C				un=2
   105  	paddq	%mm4, %mm6		C				un=2
   106  	paddq	%mm1, %mm5		C				un=2
   107  	movd	%mm6, 4(%edx)		C				un=2
   108  	psrlq	$32, %mm6		C				un=2
   109  	paddq	%mm5, %mm6		C				un=2
   110  	movd	%mm6, 8(%edx)		C				un=2
   111  	psrlq	$32, %mm6		C				un=2
   112  	movd	%mm6, 12(%edx)		C				un=2
        C Common exit for the un <= 3 paths: only esi and ebx were pushed here.
   113  L(rtr):	emms
   114  	pop	%ebx
   115  	pop	%esi
   116  	ret
   117  
        C un = 3: the first row, up[0..2] * vp[0], is stored to rp[0..3].
        C vn is decremented after each row and the routine returns once it
        C reaches zero; otherwise the rows for vp[1] and then vp[2] are added
        C in starting at rp[1] and rp[2], each storing its final carry one
        C limb above the previous top (rp[4], then rp[5]).
   118  L(un3):	movd	4(%eax), %mm1		C				un=3
   119  	pmuludq	%mm7, %mm1		C				un=3
   120  	movd	8(%eax), %mm2		C				un=3
   121  	pmuludq	%mm7, %mm2		C				un=3
   122  	movd	%mm6, (%edx)		C				un=3
   123  	psrlq	$32, %mm6		C				un=3
   124  	paddq	%mm1, %mm6		C				un=3
   125  	movd	%mm6, 4(%edx)		C				un=3
   126  	psrlq	$32, %mm6		C				un=3
   127  	paddq	%mm2, %mm6		C				un=3
   128  	movd	%mm6, 8(%edx)		C				un=3
   129  	psrlq	$32, %mm6		C				un=3
   130  	movd	%mm6, 12(%edx)		C				un=3
   131        dec	%ebx			C				un=3
   132        jz	L(rtr)			C				un=3
   133  	movd	4(%esi), %mm7		C				un=3
   134  	movd	(%eax), %mm6		C				un=3
   135  	pmuludq	%mm7, %mm6		C				un=3
   136  	movd	4(%eax), %mm1		C				un=3
   137  	movd	4(%edx), %mm4		C				un=3
   138  	pmuludq	%mm7, %mm1		C				un=3
   139  	movd	8(%eax), %mm2		C				un=3
   140  	movd	8(%edx), %mm5		C				un=3
   141  	pmuludq	%mm7, %mm2		C				un=3
   142  	paddq	%mm4, %mm6		C				un=3
   143  	paddq	%mm1, %mm5		C				un=3
   144  	movd	12(%edx), %mm4		C				un=3
   145  	movd	%mm6, 4(%edx)		C				un=3
   146  	psrlq	$32, %mm6		C				un=3
   147  	paddq	%mm5, %mm6		C				un=3
   148  	paddq	%mm2, %mm4		C				un=3
   149  	movd	%mm6, 8(%edx)		C				un=3
   150  	psrlq	$32, %mm6		C				un=3
   151  	paddq	%mm4, %mm6		C				un=3
   152  	movd	%mm6, 12(%edx)		C				un=3
   153  	psrlq	$32, %mm6		C				un=3
   154  	movd	%mm6, 16(%edx)		C				un=3
   155        dec	%ebx			C				un=3
   156        jz	L(rtr)			C				un=3
   157  	movd	8(%esi), %mm7		C				un=3
   158  	movd	(%eax), %mm6		C				un=3
   159  	pmuludq	%mm7, %mm6		C				un=3
   160  	movd	4(%eax), %mm1		C				un=3
   161  	movd	8(%edx), %mm4		C				un=3
   162  	pmuludq	%mm7, %mm1		C				un=3
   163  	movd	8(%eax), %mm2		C				un=3
   164  	movd	12(%edx), %mm5		C				un=3
   165  	pmuludq	%mm7, %mm2		C				un=3
   166  	paddq	%mm4, %mm6		C				un=3
   167  	paddq	%mm1, %mm5		C				un=3
   168  	movd	16(%edx), %mm4		C				un=3
   169  	movd	%mm6, 8(%edx)		C				un=3
   170  	psrlq	$32, %mm6		C				un=3
   171  	paddq	%mm5, %mm6		C				un=3
   172  	paddq	%mm2, %mm4		C				un=3
   173  	movd	%mm6, 12(%edx)		C				un=3
   174  	psrlq	$32, %mm6		C				un=3
   175  	paddq	%mm4, %mm6		C				un=3
   176  	movd	%mm6, 16(%edx)		C				un=3
   177  	psrlq	$32, %mm6		C				un=3
   178  	movd	%mm6, 20(%edx)		C				un=3
   179  	jmp	L(rtr)
   180  
   181  
        C un > 3: select one of four loop variants by un mod 4.  On arrival
        C eax = up, edx = rp, ecx = un, esi = vp, ebx = vn and mm7 = vp[0].
   182  L(big):	push	%edi			C third push: stack arguments are now 4 bytes further up
   183  	pxor	%mm6, %mm6		C clear the 64-bit sum and carry register
   184  	lea	4(%esi), %esi		C step vp past the limb already held in mm7
   185  	and	$3, %ecx		C ecx = un mod 4
   186  	jz	L(0)			C un mod 4 = 0
   187  	cmp	$2, %ecx
   188  	jc	L(1)			C un mod 4 = 1
   189  	jz	L(2)			C un mod 4 = 2
   190  	jmp	L(3)			C FIXME: one case should fall through
   191  
   192  
        C un mod 4 = 0.  L(0) computes the first product row, up[] * vp[0],
        C and stores it to rp with a loop that handles 4 limbs per pass and is
        C entered at L(m00).  ecx is set to (un mod 4) - un, written back over
        C the un stack slot for reuse by later rows, and counts up by 4 until
        C it reaches zero.  The last limbs of the row are stored at L(x0),
        C which is shared with the accumulating rows.
        C L(olp0) then runs once per remaining vp limb, counted down in ebx:
        C edi holds the rp row base and advances one limb per row, esi walks
        C vp, and the loop L(lam0), entered at L(am00), adds up[] * mm7 into
        C the limbs already stored at rp.
   193  L(0):	movd	(%eax), %mm3		C				m 0
   194  	sub	24(%esp), %ecx		C inner loop count		m 0
   195  	mov	%ecx, 24(%esp)		C update loop count for later	m 0
   196  	pmuludq	%mm7, %mm3		C				m 0
   197  	movd	4(%eax), %mm0		C				m 0
   198  	pmuludq	%mm7, %mm0		C				m 0
   199  	movd	8(%eax), %mm1		C				m 0
   200  	jmp	L(m00)			C				m 0
   201  	ALIGN(16)			C				m 0
   202  L(lpm0):
   203  	pmuludq	%mm7, %mm4		C				m 0
   204  	paddq	%mm0, %mm6		C				m 0
   205  	movd	(%eax), %mm3		C				m 0
   206  	movd	%mm6, -12(%edx)		C				m 0
   207  	psrlq	$32, %mm6		C				m 0
   208  	pmuludq	%mm7, %mm3		C				m 0
   209  	paddq	%mm1, %mm6		C				m 0
   210  	movd	4(%eax), %mm0		C				m 0
   211  	movd	%mm6, -8(%edx)		C				m 0
   212  	psrlq	$32, %mm6		C				m 0
   213  	pmuludq	%mm7, %mm0		C				m 0
   214  	paddq	%mm4, %mm6		C				m 0
   215  	movd	8(%eax), %mm1		C				m 0
   216  	movd	%mm6, -4(%edx)		C				m 0
   217  	psrlq	$32, %mm6		C				m 0
   218  L(m00):	pmuludq	%mm7, %mm1		C				m 0
   219  	paddq	%mm3, %mm6		C				m 0
   220  	movd	12(%eax), %mm4		C				m 0
   221  	movd	%mm6, (%edx)		C				m 0
   222  	psrlq	$32, %mm6		C				m 0
   223  	lea	16(%eax), %eax		C				m 0
   224  	lea	16(%edx), %edx		C				m 0
   225  	add	$4, %ecx		C				m 0
   226  	ja	L(lpm0)			C				m 0
   227  	pmuludq	%mm7, %mm4		C				m 0
   228  	paddq	%mm0, %mm6		C				m 0
   229  	movd	%mm6, -12(%edx)		C				m 0
   230  	psrlq	$32, %mm6		C				m 0
   231  	paddq	%mm1, %mm6		C				m 0
   232  	mov	16(%esp), %edi		C rp				  0
   233  	jmp	L(x0)
   234  
        C Start of one accumulating row: advance the row base, fetch the next
        C vp limb, reload up and the saved negative loop count, clear mm6.
   235  L(olp0):
   236  	lea	4(%edi), %edi		C				am 0
   237  	movd	(%esi), %mm7		C				am 0
   238  	lea	4(%esi), %esi		C				am 0
   239  	mov	%edi, %edx		C rp				am 0
   240  	mov	20(%esp), %eax		C up				am 0
   241  	movd	(%eax), %mm3		C				am 0
   242  	mov	24(%esp), %ecx		C inner loop count		am 0
   243  	pxor	%mm6, %mm6		C				am 0
   244  	pmuludq	%mm7, %mm3		C				am 0
   245  	movd	4(%eax), %mm0		C				am 0
   246  	movd	(%edx), %mm5		C				am 0
   247  	pmuludq	%mm7, %mm0		C				am 0
   248  	movd	8(%eax), %mm1		C				am 0
   249  	paddq	%mm3, %mm5		C				am 0
   250  	movd	4(%edx), %mm4		C				am 0
   251  	jmp	L(am00)			C				am 0
   252  	ALIGN(16)			C				am 0
   253  L(lam0):
   254  	pmuludq	%mm7, %mm2		C				am 0
   255  	paddq	%mm4, %mm6		C				am 0
   256  	movd	(%eax), %mm3		C				am 0
   257  	paddq	%mm1, %mm5		C				am 0
   258  	movd	-4(%edx), %mm4		C				am 0
   259  	movd	%mm6, -12(%edx)		C				am 0
   260  	psrlq	$32, %mm6		C				am 0
   261  	pmuludq	%mm7, %mm3		C				am 0
   262  	paddq	%mm5, %mm6		C				am 0
   263  	movd	4(%eax), %mm0		C				am 0
   264  	paddq	%mm2, %mm4		C				am 0
   265  	movd	(%edx), %mm5		C				am 0
   266  	movd	%mm6, -8(%edx)		C				am 0
   267  	psrlq	$32, %mm6		C				am 0
   268  	pmuludq	%mm7, %mm0		C				am 0
   269  	paddq	%mm4, %mm6		C				am 0
   270  	movd	8(%eax), %mm1		C				am 0
   271  	paddq	%mm3, %mm5		C				am 0
   272  	movd	4(%edx), %mm4		C				am 0
   273  	movd	%mm6, -4(%edx)		C				am 0
   274  	psrlq	$32, %mm6		C				am 0
   275  L(am00):
   276  	pmuludq	%mm7, %mm1		C				am 0
   277  	paddq	%mm5, %mm6		C				am 0
   278  	movd	12(%eax), %mm2		C				am 0
   279  	paddq	%mm0, %mm4		C				am 0
   280  	movd	8(%edx), %mm5		C				am 0
   281  	movd	%mm6, (%edx)		C				am 0
   282  	psrlq	$32, %mm6		C				am 0
   283  	lea	16(%eax), %eax		C				am 0
   284  	lea	16(%edx), %edx		C				am 0
   285  	add	$4, %ecx		C				am 0
   286  	jnz	L(lam0)			C				am 0
   287  	pmuludq	%mm7, %mm2		C				am 0
   288  	paddq	%mm4, %mm6		C				am 0
   289  	paddq	%mm1, %mm5		C				am 0
   290  	movd	-4(%edx), %mm4		C				am 0
   291  	movd	%mm6, -12(%edx)		C				am 0
   292  	psrlq	$32, %mm6		C				am 0
   293  	paddq	%mm5, %mm6		C				am 0
   294  	paddq	%mm2, %mm4		C				am 0
        C Row tail, reached from both the first row and the accumulating rows:
        C store the two pending limbs and the final carry, then count the row.
   295  L(x0):	movd	%mm6, -8(%edx)		C				am 0
   296  	psrlq	$32, %mm6		C				am 0
   297  	paddq	%mm4, %mm6		C				am 0
   298  	movd	%mm6, -4(%edx)		C				am 0
   299  	psrlq	$32, %mm6		C				am 0
   300  	movd	%mm6, (%edx)		C				am 0
   301  	dec	%ebx			C				am 0
   302  	jnz	L(olp0)			C				am 0
        C All rows done: clear MMX state, pop the three saved registers, return.
   303  L(oel0):
   304  	emms				C				   0
   305  	pop	%edi			C				   0
   306  	pop	%ebx			C				   0
   307  	pop	%esi			C				   0
   308  	ret				C				   0
   309  
   310  
        C un mod 4 = 1.  L(1) computes the first product row, up[] * vp[0],
        C and stores it to rp with a loop that handles 4 limbs per pass and is
        C entered at L(m01).  ecx is set to (un mod 4) - un, written back over
        C the un stack slot for reuse by later rows, and counts up by 4 until
        C it reaches zero.  The last limbs of the row are stored at L(x1),
        C which is shared with the accumulating rows.
        C L(olp1) then runs once per remaining vp limb, counted down in ebx:
        C edi holds the rp row base and advances one limb per row, esi walks
        C vp, and the loop L(lam1), entered at L(am01), adds up[] * mm7 into
        C the limbs already stored at rp.
   311  L(1):	movd	(%eax), %mm4		C				m 1
   312  	sub	24(%esp), %ecx		C				m 1
   313  	mov	%ecx, 24(%esp)		C update loop count for later	m 1
   314  	pmuludq	%mm7, %mm4		C				m 1
   315  	movd	4(%eax), %mm3		C				m 1
   316  	pmuludq	%mm7, %mm3		C				m 1
   317  	movd	8(%eax), %mm0		C				m 1
   318  	jmp	L(m01)			C				m 1
   319  	ALIGN(16)			C				m 1
   320  L(lpm1):
   321  	pmuludq	%mm7, %mm4		C				m 1
   322  	paddq	%mm0, %mm6		C				m 1
   323  	movd	4(%eax), %mm3		C				m 1
   324  	movd	%mm6, -8(%edx)		C				m 1
   325  	psrlq	$32, %mm6		C				m 1
   326  	pmuludq	%mm7, %mm3		C				m 1
   327  	paddq	%mm1, %mm6		C				m 1
   328  	movd	8(%eax), %mm0		C				m 1
   329  	movd	%mm6, -4(%edx)		C				m 1
   330  	psrlq	$32, %mm6		C				m 1
   331  L(m01):	pmuludq	%mm7, %mm0		C				m 1
   332  	paddq	%mm4, %mm6		C				m 1
   333  	movd	12(%eax), %mm1		C				m 1
   334  	movd	%mm6, (%edx)		C				m 1
   335  	psrlq	$32, %mm6		C				m 1
   336  	pmuludq	%mm7, %mm1		C				m 1
   337  	paddq	%mm3, %mm6		C				m 1
   338  	movd	16(%eax), %mm4		C				m 1
   339  	movd	%mm6, 4(%edx)		C				m 1
   340  	psrlq	$32, %mm6		C				m 1
   341  	lea	16(%eax), %eax		C				m 1
   342  	lea	16(%edx), %edx		C				m 1
   343  	add	$4, %ecx		C				m 1
   344  	ja	L(lpm1)			C				m 1
   345  	pmuludq	%mm7, %mm4		C				m 1
   346  	paddq	%mm0, %mm6		C				m 1
   347  	movd	%mm6, -8(%edx)		C				m 1
   348  	psrlq	$32, %mm6		C				m 1
   349  	paddq	%mm1, %mm6		C				m 1
   350  	mov	16(%esp), %edi		C rp				  1
   351  	jmp	L(x1)
   352  
        C Start of one accumulating row: advance the row base, fetch the next
        C vp limb, reload up and the saved negative loop count, clear mm6.
   353  L(olp1):
   354  	lea	4(%edi), %edi		C				am 1
   355  	movd	(%esi), %mm7		C				am 1
   356  	lea	4(%esi), %esi		C				am 1
   357  	mov	%edi, %edx		C rp				am 1
   358  	mov	20(%esp), %eax		C up				am 1
   359  	movd	(%eax), %mm2		C				am 1
   360  	mov	24(%esp), %ecx		C inner loop count		am 1
   361  	pxor	%mm6, %mm6		C				am 1
   362  	pmuludq	%mm7, %mm2		C				am 1
   363  	movd	4(%eax), %mm3		C				am 1
   364  	movd	(%edx), %mm4		C				am 1
   365  	pmuludq	%mm7, %mm3		C				am 1
   366  	movd	8(%eax), %mm0		C				am 1
   367  	paddq	%mm2, %mm4		C				am 1
   368  	movd	4(%edx), %mm5		C				am 1
   369  	jmp	L(am01)			C				am 1
   370  	ALIGN(16)			C				am 1
   371  L(lam1):
   372  	pmuludq	%mm7, %mm2		C				am 1
   373  	paddq	%mm4, %mm6		C				am 1
   374  	movd	4(%eax), %mm3		C				am 1
   375  	paddq	%mm1, %mm5		C				am 1
   376  	movd	(%edx), %mm4		C				am 1
   377  	movd	%mm6, -8(%edx)		C				am 1
   378  	psrlq	$32, %mm6		C				am 1
   379  	pmuludq	%mm7, %mm3		C				am 1
   380  	paddq	%mm5, %mm6		C				am 1
   381  	movd	8(%eax), %mm0		C				am 1
   382  	paddq	%mm2, %mm4		C				am 1
   383  	movd	4(%edx), %mm5		C				am 1
   384  	movd	%mm6, -4(%edx)		C				am 1
   385  	psrlq	$32, %mm6		C				am 1
   386  L(am01):
   387  	pmuludq	%mm7, %mm0		C				am 1
   388  	paddq	%mm4, %mm6		C				am 1
   389  	movd	12(%eax), %mm1		C				am 1
   390  	paddq	%mm3, %mm5		C				am 1
   391  	movd	8(%edx), %mm4		C				am 1
   392  	movd	%mm6, (%edx)		C				am 1
   393  	psrlq	$32, %mm6		C				am 1
   394  	pmuludq	%mm7, %mm1		C				am 1
   395  	paddq	%mm5, %mm6		C				am 1
   396  	movd	16(%eax), %mm2		C				am 1
   397  	paddq	%mm0, %mm4		C				am 1
   398  	movd	12(%edx), %mm5		C				am 1
   399  	movd	%mm6, 4(%edx)		C				am 1
   400  	psrlq	$32, %mm6		C				am 1
   401  	lea	16(%eax), %eax		C				am 1
   402  	lea	16(%edx), %edx		C				am 1
   403  	add	$4, %ecx		C				am 1
   404  	jnz	L(lam1)			C				am 1
   405  	pmuludq	%mm7, %mm2		C				am 1
   406  	paddq	%mm4, %mm6		C				am 1
   407  	paddq	%mm1, %mm5		C				am 1
   408  	movd	(%edx), %mm4		C				am 1
   409  	movd	%mm6, -8(%edx)		C				am 1
   410  	psrlq	$32, %mm6		C				am 1
   411  	paddq	%mm5, %mm6		C				am 1
   412  	paddq	%mm2, %mm4		C				am 1
        C Row tail, reached from both the first row and the accumulating rows:
        C store the two pending limbs and the final carry, then count the row.
   413  L(x1):	movd	%mm6, -4(%edx)		C				am 1
   414  	psrlq	$32, %mm6		C				am 1
   415  	paddq	%mm4, %mm6		C				am 1
   416  	movd	%mm6, (%edx)		C				am 1
   417  	psrlq	$32, %mm6		C				am 1
   418  	movd	%mm6, 4(%edx)		C				am 1
   419  	dec	%ebx			C				am 1
   420  	jnz	L(olp1)			C				am 1
        C All rows done: clear MMX state, pop the three saved registers, return.
   421  L(oel1):
   422  	emms				C				   1
   423  	pop	%edi			C				   1
   424  	pop	%ebx			C				   1
   425  	pop	%esi			C				   1
   426  	ret				C				   1
   427  
   428  
        C un mod 4 = 2.  L(2) computes the first product row, up[] * vp[0],
        C and stores it to rp with a loop that handles 4 limbs per pass and is
        C entered at L(m10).  ecx is set to (un mod 4) - un, written back over
        C the un stack slot for reuse by later rows, and counts up by 4 until
        C it reaches zero.  The last limbs of the row are stored at L(x2),
        C which is shared with the accumulating rows.
        C L(olp2) then runs once per remaining vp limb, counted down in ebx:
        C edi holds the rp row base and advances one limb per row, esi walks
        C vp, and the loop L(lam2), entered at L(am10), adds up[] * mm7 into
        C the limbs already stored at rp.
   429  L(2):	movd	(%eax), %mm1		C				m 2
   430  	sub	24(%esp), %ecx		C				m 2
   431  	mov	%ecx, 24(%esp)		C update loop count for later	m 2
   432  	pmuludq	%mm7, %mm1		C				m 2
   433  	movd	4(%eax), %mm4		C				m 2
   434  	pmuludq	%mm7, %mm4		C				m 2
   435  	movd	8(%eax), %mm3		C				m 2
   436  	jmp	L(m10)			C				m 2
   437  	ALIGN(16)			C				m 2
   438  L(lpm2):
   439  	pmuludq	%mm7, %mm4		C				m 2
   440  	paddq	%mm0, %mm6		C				m 2
   441  	movd	8(%eax), %mm3		C				m 2
   442  	movd	%mm6, -4(%edx)		C				m 2
   443  	psrlq	$32, %mm6		C				m 2
   444  L(m10):	pmuludq	%mm7, %mm3		C				m 2
   445  	paddq	%mm1, %mm6		C				m 2
   446  	movd	12(%eax), %mm0		C				m 2
   447  	movd	%mm6, (%edx)		C				m 2
   448  	psrlq	$32, %mm6		C				m 2
   449  	pmuludq	%mm7, %mm0		C				m 2
   450  	paddq	%mm4, %mm6		C				m 2
   451  	movd	16(%eax), %mm1		C				m 2
   452  	movd	%mm6, 4(%edx)		C				m 2
   453  	psrlq	$32, %mm6		C				m 2
   454  	pmuludq	%mm7, %mm1		C				m 2
   455  	paddq	%mm3, %mm6		C				m 2
   456  	movd	20(%eax), %mm4		C				m 2
   457  	movd	%mm6, 8(%edx)		C				m 2
   458  	psrlq	$32, %mm6		C				m 2
   459  	lea	16(%eax), %eax		C				m 2
   460  	lea	16(%edx), %edx		C				m 2
   461  	add	$4, %ecx		C				m 2
   462  	ja	L(lpm2)			C				m 2
   463  	pmuludq	%mm7, %mm4		C				m 2
   464  	paddq	%mm0, %mm6		C				m 2
   465  	movd	%mm6, -4(%edx)		C				m 2
   466  	psrlq	$32, %mm6		C				m 2
   467  	paddq	%mm1, %mm6		C				m 2
   468  	mov	16(%esp), %edi		C rp				  2
   469  	jmp	L(x2)
   470  
        C Start of one accumulating row: advance the row base, fetch the next
        C vp limb, reload up and the saved negative loop count, clear mm6.
   471  L(olp2):
   472  	lea	4(%edi), %edi		C				am 2
   473  	movd	(%esi), %mm7		C				am 2
   474  	lea	4(%esi), %esi		C				am 2
   475  	mov	%edi, %edx		C rp				am 2
   476  	mov	20(%esp), %eax		C up				am 2
   477  	movd	(%eax), %mm1		C				am 2
   478  	mov	24(%esp), %ecx		C inner loop count		am 2
   479  	pxor	%mm6, %mm6		C				am 2
   480  	pmuludq	%mm7, %mm1		C				am 2
   481  	movd	4(%eax), %mm2		C				am 2
   482  	movd	(%edx), %mm5		C				am 2
   483  	pmuludq	%mm7, %mm2		C				am 2
   484  	movd	8(%eax), %mm3		C				am 2
   485  	paddq	%mm1, %mm5		C				am 2
   486  	movd	4(%edx), %mm4		C				am 2
   487  	jmp	L(am10)			C				am 2
   488  	ALIGN(16)			C				am 2
   489  L(lam2):
   490  	pmuludq	%mm7, %mm2		C				am 2
   491  	paddq	%mm4, %mm6		C				am 2
   492  	movd	8(%eax), %mm3		C				am 2
   493  	paddq	%mm1, %mm5		C				am 2
   494  	movd	4(%edx), %mm4		C				am 2
   495  	movd	%mm6, -4(%edx)		C				am 2
   496  	psrlq	$32, %mm6		C				am 2
   497  L(am10):
   498  	pmuludq	%mm7, %mm3		C				am 2
   499  	paddq	%mm5, %mm6		C				am 2
   500  	movd	12(%eax), %mm0		C				am 2
   501  	paddq	%mm2, %mm4		C				am 2
   502  	movd	8(%edx), %mm5		C				am 2
   503  	movd	%mm6, (%edx)		C				am 2
   504  	psrlq	$32, %mm6		C				am 2
   505  	pmuludq	%mm7, %mm0		C				am 2
   506  	paddq	%mm4, %mm6		C				am 2
   507  	movd	16(%eax), %mm1		C				am 2
   508  	paddq	%mm3, %mm5		C				am 2
   509  	movd	12(%edx), %mm4		C				am 2
   510  	movd	%mm6, 4(%edx)		C				am 2
   511  	psrlq	$32, %mm6		C				am 2
   512  	pmuludq	%mm7, %mm1		C				am 2
   513  	paddq	%mm5, %mm6		C				am 2
   514  	movd	20(%eax), %mm2		C				am 2
   515  	paddq	%mm0, %mm4		C				am 2
   516  	movd	16(%edx), %mm5		C				am 2
   517  	movd	%mm6, 8(%edx)		C				am 2
   518  	psrlq	$32, %mm6		C				am 2
   519  	lea	16(%eax), %eax		C				am 2
   520  	lea	16(%edx), %edx		C				am 2
   521  	add	$4, %ecx		C				am 2
   522  	jnz	L(lam2)			C				am 2
   523  	pmuludq	%mm7, %mm2		C				am 2
   524  	paddq	%mm4, %mm6		C				am 2
   525  	paddq	%mm1, %mm5		C				am 2
   526  	movd	4(%edx), %mm4		C				am 2
   527  	movd	%mm6, -4(%edx)		C				am 2
   528  	psrlq	$32, %mm6		C				am 2
   529  	paddq	%mm5, %mm6		C				am 2
   530  	paddq	%mm2, %mm4		C				am 2
        C Row tail, reached from both the first row and the accumulating rows:
        C store the two pending limbs and the final carry, then count the row.
   531  L(x2):	movd	%mm6, (%edx)		C				am 2
   532  	psrlq	$32, %mm6		C				am 2
   533  	paddq	%mm4, %mm6		C				am 2
   534  	movd	%mm6, 4(%edx)		C				am 2
   535  	psrlq	$32, %mm6		C				am 2
   536  	movd	%mm6, 8(%edx)		C				am 2
   537  	dec	%ebx			C				am 2
   538  	jnz	L(olp2)			C				am 2
        C All rows done: clear MMX state, pop the three saved registers, return.
   539  L(oel2):
   540  	emms				C				   2
   541  	pop	%edi			C				   2
   542  	pop	%ebx			C				   2
   543  	pop	%esi			C				   2
   544  	ret				C				   2
   545  
   546  
        C un mod 4 = 3.  L(3) computes the first product row, up[] * vp[0],
        C and stores it to rp with a loop that handles 4 limbs per pass and is
        C entered at its top, L(lpm3); the jmp in front of it only skips any
        C padding that ALIGN inserts.  ecx is set to (un mod 4) - un, written
        C back over the un stack slot for reuse by later rows, and counts up by
        C 4 until it reaches zero.  The last limbs of the row are stored at
        C L(x3), which is shared with the accumulating rows.
        C L(olp3) then runs once per remaining vp limb, counted down in ebx:
        C edi holds the rp row base and advances one limb per row, esi walks
        C vp, and the loop L(lam3), also entered at its top, adds up[] * mm7
        C into the limbs already stored at rp.
   547  L(3):	movd	(%eax), %mm0		C				m 3
   548  	sub	24(%esp), %ecx		C				m 3
   549  	mov	%ecx, 24(%esp)		C update loop count for later	m 3
   550  	pmuludq	%mm7, %mm0		C				m 3
   551  	movd	4(%eax), %mm1		C				m 3
   552  	pmuludq	%mm7, %mm1		C				m 3
   553  	movd	8(%eax), %mm4		C				m 3
   554  	jmp	L(lpm3)			C				m 3
   555  	ALIGN(16)			C				m 3
   556  L(lpm3):
   557  	pmuludq	%mm7, %mm4		C				m 3
   558  	paddq	%mm0, %mm6		C				m 3
   559  	movd	12(%eax), %mm3		C				m 3
   560  	movd	%mm6, (%edx)		C				m 3
   561  	psrlq	$32, %mm6		C				m 3
   562  	pmuludq	%mm7, %mm3		C				m 3
   563  	paddq	%mm1, %mm6		C				m 3
   564  	movd	16(%eax), %mm0		C				m 3
   565  	movd	%mm6, 4(%edx)		C				m 3
   566  	psrlq	$32, %mm6		C				m 3
   567  	pmuludq	%mm7, %mm0		C				m 3
   568  	paddq	%mm4, %mm6		C				m 3
   569  	movd	20(%eax), %mm1		C				m 3
   570  	movd	%mm6, 8(%edx)		C				m 3
   571  	psrlq	$32, %mm6		C				m 3
   572  	pmuludq	%mm7, %mm1		C				m 3
   573  	paddq	%mm3, %mm6		C				m 3
   574  	movd	24(%eax), %mm4		C				m 3
   575  	movd	%mm6, 12(%edx)		C				m 3
   576  	psrlq	$32, %mm6		C				m 3
   577  	lea	16(%eax), %eax		C				m 3
   578  	lea	16(%edx), %edx		C				m 3
   579  	add	$4, %ecx		C				m 3
   580  	ja	L(lpm3)			C				m 3
   581  	pmuludq	%mm7, %mm4		C				m 3
   582  	paddq	%mm0, %mm6		C				m 3
   583  	movd	%mm6, (%edx)		C				m 3
   584  	psrlq	$32, %mm6		C				m 3
   585  	paddq	%mm1, %mm6		C				m 3
   586  	mov	16(%esp), %edi		C rp				  3
   587  	jmp	L(x3)
   588  
        C Start of one accumulating row: advance the row base, fetch the next
        C vp limb, reload up and the saved negative loop count, clear mm6.
   589  L(olp3):
   590  	lea	4(%edi), %edi		C				am 3
   591  	movd	(%esi), %mm7		C				am 3
   592  	lea	4(%esi), %esi		C				am 3
   593  	mov	%edi, %edx		C rp				am 3
   594  	mov	20(%esp), %eax		C up				am 3
   595  	movd	(%eax), %mm0		C				am 3
   596  	mov	24(%esp), %ecx		C inner loop count		am 3
   597  	pxor	%mm6, %mm6		C				am 3
   598  	pmuludq	%mm7, %mm0		C				am 3
   599  	movd	4(%eax), %mm1		C				am 3
   600  	movd	(%edx), %mm4		C				am 3
   601  	pmuludq	%mm7, %mm1		C				am 3
   602  	movd	8(%eax), %mm2		C				am 3
   603  	paddq	%mm0, %mm4		C				am 3
   604  	movd	4(%edx), %mm5		C				am 3
   605  	jmp	L(lam3)			C				am 3
   606  	ALIGN(16)			C				am 3
   607  L(lam3):
   608  	pmuludq	%mm7, %mm2		C				am 3
   609  	paddq	%mm4, %mm6		C				am 3
   610  	movd	12(%eax), %mm3		C				am 3
   611  	paddq	%mm1, %mm5		C				am 3
   612  	movd	8(%edx), %mm4		C				am 3
   613  	movd	%mm6, (%edx)		C				am 3
   614  	psrlq	$32, %mm6		C				am 3
   615  	pmuludq	%mm7, %mm3		C				am 3
   616  	paddq	%mm5, %mm6		C				am 3
   617  	movd	16(%eax), %mm0		C				am 3
   618  	paddq	%mm2, %mm4		C				am 3
   619  	movd	12(%edx), %mm5		C				am 3
   620  	movd	%mm6, 4(%edx)		C				am 3
   621  	psrlq	$32, %mm6		C				am 3
   622  	pmuludq	%mm7, %mm0		C				am 3
   623  	paddq	%mm4, %mm6		C				am 3
   624  	movd	20(%eax), %mm1		C				am 3
   625  	paddq	%mm3, %mm5		C				am 3
   626  	movd	16(%edx), %mm4		C				am 3
   627  	movd	%mm6, 8(%edx)		C				am 3
   628  	psrlq	$32, %mm6		C				am 3
   629  	pmuludq	%mm7, %mm1		C				am 3
   630  	paddq	%mm5, %mm6		C				am 3
   631  	movd	24(%eax), %mm2		C				am 3
   632  	paddq	%mm0, %mm4		C				am 3
   633  	movd	20(%edx), %mm5		C				am 3
   634  	movd	%mm6, 12(%edx)		C				am 3
   635  	psrlq	$32, %mm6		C				am 3
   636  	lea	16(%eax), %eax		C				am 3
   637  	lea	16(%edx), %edx		C				am 3
   638  	add	$4, %ecx		C				am 3
   639  	jnz	L(lam3)			C				am 3
   640  	pmuludq	%mm7, %mm2		C				am 3
   641  	paddq	%mm4, %mm6		C				am 3
   642  	paddq	%mm1, %mm5		C				am 3
   643  	movd	8(%edx), %mm4		C				am 3
   644  	movd	%mm6, (%edx)		C				am 3
   645  	psrlq	$32, %mm6		C				am 3
   646  	paddq	%mm5, %mm6		C				am 3
   647  	paddq	%mm2, %mm4		C				am 3
        C Row tail, reached from both the first row and the accumulating rows:
        C store the two pending limbs and the final carry, then count the row.
   648  L(x3):	movd	%mm6, 4(%edx)		C				am 3
   649  	psrlq	$32, %mm6		C				am 3
   650  	paddq	%mm4, %mm6		C				am 3
   651  	movd	%mm6, 8(%edx)		C				am 3
   652  	psrlq	$32, %mm6		C				am 3
   653  	movd	%mm6, 12(%edx)		C				am 3
   654  	dec	%ebx			C				am 3
   655  	jnz	L(olp3)			C				am 3
        C All rows done: clear MMX state, pop the three saved registers, return.
   656  L(oel3):
   657  	emms				C				   3
   658  	pop	%edi			C				   3
   659  	pop	%ebx			C				   3
   660  	pop	%esi			C				   3
   661  	ret				C				   3
   662  EPILOGUE()