github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/sqr_basecase.asm (about)

     1  dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
     2  
     3  dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C TODO:
    34  C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
    35  C    scheduling could improve things by several cycles per outer iteration.
    36  C  * In the L(am3)...L(am1) code, keep accumulation operands in registers,
    37  C    without storing intermediates to rp.
    38  C  * We might want to keep 32 in a free mm register, since the register form is
    39  C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
    40  C  * Look into different loop alignment, we now expand the code about 50 bytes
    41  C    with possibly needless alignment.
    42  C  * Use OSP, should solve feed-in latency problems.
    43  C  * Address relative slowness for un<=3 for Pentium M.  The old code is
    44  C    considerably faster there.  (1:20/14, 2:34/32, 3:66/57)
    45  
    46  C INPUT PARAMETERS
    47  C rp		sp + 4
    48  C up		sp + 8
    49  C un		sp + 12
    50  
    51  	TEXT
    52  	ALIGN(16)
        C Entry: load the three stack arguments listed under INPUT PARAMETERS
        C and branch on un to a size-specific code path.
    53  PROLOGUE(mpn_sqr_basecase)
    54  	mov	4(%esp), %edx		C edx = rp
    55  	mov	8(%esp), %eax		C eax = up
    56  	mov	12(%esp), %ecx		C ecx = un
    57  
    58  	cmp	$2, %ecx		C flags of un - 2 feed both jumps below
    59  	jc	L(un1)			C un < 2
    60  	jz	L(un2)			C un == 2
    61  	cmp	$4, %ecx		C flags of un - 4 feed both jumps below
    62  	jc	L(un3)			C un < 4, so un == 3 here
    63  	jz	L(un4)			C un == 4
    64  	jmp	L(big)			C un > 4
    65  
    66  L(un1):	mov	(%eax), %eax		C un < 2: eax = up[0]
    67  	mov	%edx, %ecx		C keep rp in ecx, mul overwrites edx
    68  	mul	%eax			C edx:eax = up[0]^2
    69  	mov	%eax, (%ecx)		C rp[0] = low limb
    70  	mov	%edx, 4(%ecx)		C rp[1] = high limb
    71  	ret
    72  L(un2):	movd	(%eax), %mm0		C mm0 = up[0]			un=2
    73  	movd	(%eax), %mm2		C mm2 = up[0]			un=2
    74  	movd	4(%eax), %mm1		C mm1 = up[1]			un=2
    75  	pmuludq	%mm0, %mm0		C 64b weight 0			un=2
    76  	pmuludq	%mm1, %mm2		C 64b weight 32			un=2
    77  	pmuludq	%mm1, %mm1		C 64b weight 64			un=2
    78  	movd	%mm0, (%edx)		C rp[0]				un=2
    79  	psrlq	$32, %mm0		C 32b weight 32			un=2
        C The cross product in mm2 is doubled in two pieces: its low 31 bits
        C (masked with mm7) are shifted left by one, and the remaining bits are
        C shifted right by 31, which equals doubling and then dropping 32 bits.
    80  	pcmpeqd	%mm7, %mm7		C all ones			un=2
    81  	psrlq	$33, %mm7		C 0x000000007FFFFFFF		un=2
    82  	pand	%mm2, %mm7		C 31b weight 32			un=2
    83  	psrlq	$31, %mm2		C 33b weight 65			un=2
    84  	psllq	$1, %mm7		C 31b weight 33			un=2
    85  	paddq	%mm7, %mm0		C				un=2
    86  	movd	%mm0, 4(%edx)		C rp[1]				un=2
    87  	psrlq	$32, %mm0		C carry out of rp[1]		un=2
    88  	paddq	%mm2, %mm1		C				un=2
    89  	paddq	%mm0, %mm1		C				un=2
    90  	movd	%mm1, 8(%edx)		C rp[2]				un=2
    91  	psrlq	$32, %mm1		C				un=2
    92  	movd	%mm1, 12(%edx)		C rp[3]				un=2
    93  	emms
    94  	ret
    95  L(un3):	movd	(%eax), %mm7		C mm7 = up[0]			un=3
    96  	movd	4(%eax), %mm6		C mm6 = up[1]			un=3
    97  	pmuludq	%mm7, %mm6		C up[0]*up[1]			un=3
    98  	movd	8(%eax), %mm2		C mm2 = up[2]			un=3
    99  	pmuludq	%mm7, %mm2		C up[0]*up[2]			un=3
   100  	movd	%mm6, 4(%edx)		C rp[1]				un=3
   101  	psrlq	$32, %mm6		C keep upper half as carry	un=3
   102  	paddq	%mm2, %mm6		C				un=3
   103  	movd	%mm6, 8(%edx)		C rp[2]				un=3
   104  	psrlq	$32, %mm6		C				un=3
   105  	movd	%mm6, 12(%edx)		C rp[3]				un=3
   106  	lea	4(%edx), %edx		C edx = rp + 4 bytes		un=3
   107  	lea	4(%eax), %eax		C eax = up + 4 bytes		un=3
   108  	jmp	L(am1)			C add the up[1]*up[2] row, then the diagonal
   109  L(un4):	movd	(%eax), %mm7		C mm7 = up[0]			un=4
   110  	movd	4(%eax), %mm6		C mm6 = up[1]			un=4
   111  	pmuludq	%mm7, %mm6		C up[0]*up[1]			un=4
   112  	movd	8(%eax), %mm0		C mm0 = up[2]			un=4
   113  	pmuludq	%mm7, %mm0		C up[0]*up[2]			un=4
   114  	movd	12(%eax), %mm1		C mm1 = up[3]			un=4
   115  	pmuludq	%mm7, %mm1		C up[0]*up[3]			un=4
   116  	movd	%mm6, 4(%edx)		C rp[1]				un=4
   117  	psrlq	$32, %mm6		C keep upper half as carry	un=4
   118  	paddq	%mm0, %mm6		C				un=4
   119  	movd	%mm6, 8(%edx)		C rp[2]				un=4
   120  	psrlq	$32, %mm6		C				un=4
   121  	paddq	%mm1, %mm6		C				un=4
   122  	movd	%mm6, 12(%edx)		C rp[3]				un=4
   123  	psrlq	$32, %mm6		C				un=4
   124  	movd	%mm6, 16(%edx)		C rp[4]				un=4
   125  	lea	4(%edx), %edx		C edx = rp + 4 bytes		un=4
   126  	lea	4(%eax), %eax		C eax = up + 4 bytes		un=4
   127  	jmp	L(am2)			C add the two remaining rows, then the diagonal
   128  
   129  L(big):	push	%esi
   130  	push	%ebx
   131  	push	%edi
        C un > 4.  Set up the first pass, which multiplies the limbs from up[1]
        C onward by up[0] (mm7) and stores the result from rp[1] onward.  The
        C value of un mod 4 picks one of four entry points into that pass.
   132  	pxor	%mm6, %mm6		C mm6 = 0, the carry accumulator
   133  	movd	(%eax), %mm7		C mm7 = up[0]
   134  	lea	4(%eax), %esi		C init up, up++
   135  	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets
   136  	lea	4(%edx), %edi		C init rp, rp++
   137  	lea	4(%edx), %edx		C rp2++
   138  	lea	-4(%ecx), %ebx		C loop count, un - 4
   139  	and	$3, %ecx		C ecx = un mod 4
   140  	jz	L(3m)			C un mod 4 == 0
   141  	cmp	$2, %ecx
   142  	ja	L(2m)			C un mod 4 == 3
   143  	jb	L(0m)			C un mod 4 == 1
        C un mod 4 == 2 falls through to L(1m).
   144  
   145  L(1m):
        C First pass for un mod 4 == 2.  Products of mm7 with successive limbs
        C at (%eax) are summed through mm6: each movd stores the low 32 bits to
        C rp and psrlq $32 keeps the rest as carry.  rp is never read here.
        C The loop handles four limbs per iteration and is entered at L(m01).
   146  	movd	(%eax), %mm4		C				m 1
   147  	lea	(%ebx), %ecx		C inner loop count		m 1
   148  	pmuludq	%mm7, %mm4		C				m 1
   149  	movd	4(%eax), %mm3		C				m 1
   150  	pmuludq	%mm7, %mm3		C				m 1
   151  	movd	8(%eax), %mm0		C				m 1
   152  	jmp	L(m01)			C				m 1
   153  	ALIGN(16)			C				m 1
   154  L(lpm1):
   155  	pmuludq	%mm7, %mm4		C				m 1
   156  	paddq	%mm0, %mm6		C				m 1
   157  	movd	4(%eax), %mm3		C				m 1
   158  	movd	%mm6, -8(%edx)		C				m 1
   159  	psrlq	$32, %mm6		C				m 1
   160  	pmuludq	%mm7, %mm3		C				m 1
   161  	paddq	%mm1, %mm6		C				m 1
   162  	movd	8(%eax), %mm0		C				m 1
   163  	movd	%mm6, -4(%edx)		C				m 1
   164  	psrlq	$32, %mm6		C				m 1
   165  L(m01):	pmuludq	%mm7, %mm0		C				m 1
   166  	paddq	%mm4, %mm6		C				m 1
   167  	movd	12(%eax), %mm1		C				m 1
   168  	movd	%mm6, (%edx)		C				m 1
   169  	psrlq	$32, %mm6		C				m 1
   170  	pmuludq	%mm7, %mm1		C				m 1
   171  	paddq	%mm3, %mm6		C				m 1
   172  	movd	16(%eax), %mm4		C				m 1
   173  	movd	%mm6, 4(%edx)		C				m 1
   174  	psrlq	$32, %mm6		C				m 1
   175  	lea	16(%eax), %eax		C				m 1
   176  	lea	16(%edx), %edx		C				m 1
   177  	sub	$4, %ecx		C				m 1
   178  	ja	L(lpm1)			C				m 1
   179  	pmuludq	%mm7, %mm4		C				m 1
   180  	paddq	%mm0, %mm6		C				m 1
   181  	movd	%mm6, -8(%edx)		C				m 1
   182  	psrlq	$32, %mm6		C				m 1
   183  	paddq	%mm1, %mm6		C				m 1
   184  	jmp	L(0)			C L(0) stores the last limbs (mm6, then mm4)
   185  
   186  L(2m):
        C First pass for un mod 4 == 3.  Products of mm7 with successive limbs
        C at (%eax) are summed through mm6: each movd stores the low 32 bits to
        C rp and psrlq $32 keeps the rest as carry.  rp is never read here.
        C The loop handles four limbs per iteration and is entered at L(m10).
   187  	movd	(%eax), %mm1		C				m 2
   188  	lea	(%ebx), %ecx		C inner loop count		m 2
   189  	pmuludq	%mm7, %mm1		C				m 2
   190  	movd	4(%eax), %mm4		C				m 2
   191  	pmuludq	%mm7, %mm4		C				m 2
   192  	movd	8(%eax), %mm3		C				m 2
   193  	jmp	L(m10)			C				m 2
   194  	ALIGN(16)			C				m 2
   195  L(lpm2):
   196  	pmuludq	%mm7, %mm4		C				m 2
   197  	paddq	%mm0, %mm6		C				m 2
   198  	movd	8(%eax), %mm3		C				m 2
   199  	movd	%mm6, -4(%edx)		C				m 2
   200  	psrlq	$32, %mm6		C				m 2
   201  L(m10):	pmuludq	%mm7, %mm3		C				m 2
   202  	paddq	%mm1, %mm6		C				m 2
   203  	movd	12(%eax), %mm0		C				m 2
   204  	movd	%mm6, (%edx)		C				m 2
   205  	psrlq	$32, %mm6		C				m 2
   206  	pmuludq	%mm7, %mm0		C				m 2
   207  	paddq	%mm4, %mm6		C				m 2
   208  	movd	16(%eax), %mm1		C				m 2
   209  	movd	%mm6, 4(%edx)		C				m 2
   210  	psrlq	$32, %mm6		C				m 2
   211  	pmuludq	%mm7, %mm1		C				m 2
   212  	paddq	%mm3, %mm6		C				m 2
   213  	movd	20(%eax), %mm4		C				m 2
   214  	movd	%mm6, 8(%edx)		C				m 2
   215  	psrlq	$32, %mm6		C				m 2
   216  	lea	16(%eax), %eax		C				m 2
   217  	lea	16(%edx), %edx		C				m 2
   218  	sub	$4, %ecx		C				m 2
   219  	ja	L(lpm2)			C				m 2
   220  	pmuludq	%mm7, %mm4		C				m 2
   221  	paddq	%mm0, %mm6		C				m 2
   222  	movd	%mm6, -4(%edx)		C				m 2
   223  	psrlq	$32, %mm6		C				m 2
   224  	paddq	%mm1, %mm6		C				m 2
   225  	jmp	L(1)			C L(1) stores the last limbs (mm6, then mm4)
   226  
   227  L(3m):
        C First pass for un mod 4 == 0.  Products of mm7 with successive limbs
        C at (%eax) are summed through mm6: each movd stores the low 32 bits to
        C rp and psrlq $32 keeps the rest as carry.  rp is never read here.
        C The loop handles four limbs per iteration and is entered at its top.
   228  	movd	(%eax), %mm0		C				m 3
   229  	lea	(%ebx), %ecx		C inner loop count		m 3
   230  	pmuludq	%mm7, %mm0		C				m 3
   231  	movd	4(%eax), %mm1		C				m 3
   232  	pmuludq	%mm7, %mm1		C				m 3
   233  	movd	8(%eax), %mm4		C				m 3
   234  	jmp	L(lpm3)			C				m 3
   235  	ALIGN(16)			C				m 3
   236  L(lpm3):
   237  	pmuludq	%mm7, %mm4		C				m 3
   238  	paddq	%mm0, %mm6		C				m 3
   239  	movd	12(%eax), %mm3		C				m 3
   240  	movd	%mm6, (%edx)		C				m 3
   241  	psrlq	$32, %mm6		C				m 3
   242  	pmuludq	%mm7, %mm3		C				m 3
   243  	paddq	%mm1, %mm6		C				m 3
   244  	movd	16(%eax), %mm0		C				m 3
   245  	movd	%mm6, 4(%edx)		C				m 3
   246  	psrlq	$32, %mm6		C				m 3
   247  	pmuludq	%mm7, %mm0		C				m 3
   248  	paddq	%mm4, %mm6		C				m 3
   249  	movd	20(%eax), %mm1		C				m 3
   250  	movd	%mm6, 8(%edx)		C				m 3
   251  	psrlq	$32, %mm6		C				m 3
   252  	pmuludq	%mm7, %mm1		C				m 3
   253  	paddq	%mm3, %mm6		C				m 3
   254  	movd	24(%eax), %mm4		C				m 3
   255  	movd	%mm6, 12(%edx)		C				m 3
   256  	psrlq	$32, %mm6		C				m 3
   257  	lea	16(%eax), %eax		C				m 3
   258  	lea	16(%edx), %edx		C				m 3
   259  	sub	$4, %ecx		C				m 3
   260  	ja	L(lpm3)			C				m 3
   261  	pmuludq	%mm7, %mm4		C				m 3
   262  	paddq	%mm0, %mm6		C				m 3
   263  	movd	%mm6, (%edx)		C				m 3
   264  	psrlq	$32, %mm6		C				m 3
   265  	paddq	%mm1, %mm6		C				m 3
   266  	jmp	L(2)			C L(2) stores the last limbs (mm6, then mm4)
   267  
   268  L(0m):
        C First pass for un mod 4 == 1.  Products of mm7 with successive limbs
        C at (%eax) are summed through mm6: each movd stores the low 32 bits to
        C rp and psrlq $32 keeps the rest as carry.  rp is never read here.
        C The loop handles four limbs per iteration and is entered at L(m00).
   269  	movd	(%eax), %mm3		C				m 0
   270  	lea	(%ebx), %ecx		C inner loop count		m 0
   271  	pmuludq	%mm7, %mm3		C				m 0
   272  	movd	4(%eax), %mm0		C				m 0
   273  	pmuludq	%mm7, %mm0		C				m 0
   274  	movd	8(%eax), %mm1		C				m 0
   275  	jmp	L(m00)			C				m 0
   276  	ALIGN(16)			C				m 0
   277  L(lpm0):
   278  	pmuludq	%mm7, %mm4		C				m 0
   279  	paddq	%mm0, %mm6		C				m 0
   280  	movd	(%eax), %mm3		C				m 0
   281  	movd	%mm6, -12(%edx)		C				m 0
   282  	psrlq	$32, %mm6		C				m 0
   283  	pmuludq	%mm7, %mm3		C				m 0
   284  	paddq	%mm1, %mm6		C				m 0
   285  	movd	4(%eax), %mm0		C				m 0
   286  	movd	%mm6, -8(%edx)		C				m 0
   287  	psrlq	$32, %mm6		C				m 0
   288  	pmuludq	%mm7, %mm0		C				m 0
   289  	paddq	%mm4, %mm6		C				m 0
   290  	movd	8(%eax), %mm1		C				m 0
   291  	movd	%mm6, -4(%edx)		C				m 0
   292  	psrlq	$32, %mm6		C				m 0
   293  L(m00):	pmuludq	%mm7, %mm1		C				m 0
   294  	paddq	%mm3, %mm6		C				m 0
   295  	movd	12(%eax), %mm4		C				m 0
   296  	movd	%mm6, (%edx)		C				m 0
   297  	psrlq	$32, %mm6		C				m 0
   298  	lea	16(%eax), %eax		C				m 0
   299  	lea	16(%edx), %edx		C				m 0
   300  	sub	$4, %ecx		C				m 0
   301  	ja	L(lpm0)			C				m 0
   302  	pmuludq	%mm7, %mm4		C				m 0
   303  	paddq	%mm0, %mm6		C				m 0
   304  	movd	%mm6, -12(%edx)		C				m 0
   305  	psrlq	$32, %mm6		C				m 0
   306  	paddq	%mm1, %mm6		C				m 0
   307  	jmp	L(3)			C L(3) stores the last limbs (mm6, then mm4)
   308  
   309  L(outer):
        C Row tagged am 3, first of the four rows done per outer iteration.
        C rp (edi) advances two limbs, mm7 takes the next multiplier limb from
        C (%esi), and the products of mm7 with the limbs that follow it are
        C added to the limbs already in rp.  mm4 and mm5 alternate as the rp
        C limb plus product, mm6 is the carry accumulator.
   310  	lea	8(%edi), %edi		C rp += 2
   311  	movd	(%esi), %mm7		C				am 3
   312  	mov	%edi, %edx		C rp2 = rp			am 3
   313  	lea	4(%esi), %esi		C up++				am 3
   314  	lea	(%esi), %eax		C up2 = up			am 3
   315  	movd	(%eax), %mm0		C				am 3
   316  	lea	(%ebx), %ecx		C inner loop count		am 3
   317  	pxor	%mm6, %mm6		C				am 3
   318  	pmuludq	%mm7, %mm0		C				am 3
   319  	movd	4(%eax), %mm1		C				am 3
   320  	movd	(%edx), %mm4		C				am 3
   321  	pmuludq	%mm7, %mm1		C				am 3
   322  	movd	8(%eax), %mm2		C				am 3
   323  	paddq	%mm0, %mm4		C				am 3
   324  	movd	4(%edx), %mm5		C				am 3
   325  	jmp	L(lam3)			C				am 3
   326  	ALIGN(16)			C				am 3
   327  L(lam3):
   328  	pmuludq	%mm7, %mm2		C				am 3
   329  	paddq	%mm4, %mm6		C				am 3
   330  	movd	12(%eax), %mm3		C				am 3
   331  	paddq	%mm1, %mm5		C				am 3
   332  	movd	8(%edx), %mm4		C				am 3
   333  	movd	%mm6, (%edx)		C				am 3
   334  	psrlq	$32, %mm6		C				am 3
   335  	pmuludq	%mm7, %mm3		C				am 3
   336  	paddq	%mm5, %mm6		C				am 3
   337  	movd	16(%eax), %mm0		C				am 3
   338  	paddq	%mm2, %mm4		C				am 3
   339  	movd	12(%edx), %mm5		C				am 3
   340  	movd	%mm6, 4(%edx)		C				am 3
   341  	psrlq	$32, %mm6		C				am 3
   342  	pmuludq	%mm7, %mm0		C				am 3
   343  	paddq	%mm4, %mm6		C				am 3
   344  	movd	20(%eax), %mm1		C				am 3
   345  	paddq	%mm3, %mm5		C				am 3
   346  	movd	16(%edx), %mm4		C				am 3
   347  	movd	%mm6, 8(%edx)		C				am 3
   348  	psrlq	$32, %mm6		C				am 3
   349  	pmuludq	%mm7, %mm1		C				am 3
   350  	paddq	%mm5, %mm6		C				am 3
   351  	movd	24(%eax), %mm2		C				am 3
   352  	paddq	%mm0, %mm4		C				am 3
   353  	movd	20(%edx), %mm5		C				am 3
   354  	movd	%mm6, 12(%edx)		C				am 3
   355  	psrlq	$32, %mm6		C				am 3
   356  	lea	16(%eax), %eax		C				am 3
   357  	lea	16(%edx), %edx		C				am 3
   358  	sub	$4, %ecx		C				am 3
   359  	ja	L(lam3)			C				am 3
   360  	pmuludq	%mm7, %mm2		C				am 3
   361  	paddq	%mm4, %mm6		C				am 3
   362  	paddq	%mm1, %mm5		C				am 3
   363  	movd	8(%edx), %mm4		C				am 3
   364  	movd	%mm6, (%edx)		C				am 3
   365  	psrlq	$32, %mm6		C				am 3
   366  	paddq	%mm5, %mm6		C				am 3
   367  	paddq	%mm2, %mm4		C				am 3
        C L(2) is also the jump target at the end of the L(3m) first pass.
   368  L(2):	movd	%mm6, 4(%edx)		C				am 3
   369  	psrlq	$32, %mm6		C				am 3
   370  	paddq	%mm4, %mm6		C				am 3
   371  	movd	%mm6, 8(%edx)		C				am 3
   372  	psrlq	$32, %mm6		C				am 3
   373  	movd	%mm6, 12(%edx)		C				am 3
   374  
   375  	lea	8(%edi), %edi		C rp += 2
        C Row tagged am 2.  Same scheme as the other am rows: mm7 is the next
        C multiplier limb from (%esi), its products with the limbs after it
        C are added to the limbs already in rp, and mm6 is the carry
        C accumulator.  The loop is entered at L(am10).
   376  	movd	(%esi), %mm7		C				am 2
   377  	mov	%edi, %edx		C rp2 = rp			am 2
   378  	lea	4(%esi), %esi		C up++				am 2
   379  	lea	(%esi), %eax		C up2 = up			am 2
   380  	movd	(%eax), %mm1		C				am 2
   381  	lea	(%ebx), %ecx		C inner loop count		am 2
   382  	pxor	%mm6, %mm6		C				am 2
   383  	pmuludq	%mm7, %mm1		C				am 2
   384  	movd	4(%eax), %mm2		C				am 2
   385  	movd	(%edx), %mm5		C				am 2
   386  	pmuludq	%mm7, %mm2		C				am 2
   387  	movd	8(%eax), %mm3		C				am 2
   388  	paddq	%mm1, %mm5		C				am 2
   389  	movd	4(%edx), %mm4		C				am 2
   390  	jmp	L(am10)			C				am 2
   391  	ALIGN(16)			C				am 2
   392  L(lam2):
   393  	pmuludq	%mm7, %mm2		C				am 2
   394  	paddq	%mm4, %mm6		C				am 2
   395  	movd	8(%eax), %mm3		C				am 2
   396  	paddq	%mm1, %mm5		C				am 2
   397  	movd	4(%edx), %mm4		C				am 2
   398  	movd	%mm6, -4(%edx)		C				am 2
   399  	psrlq	$32, %mm6		C				am 2
   400  L(am10):
   401  	pmuludq	%mm7, %mm3		C				am 2
   402  	paddq	%mm5, %mm6		C				am 2
   403  	movd	12(%eax), %mm0		C				am 2
   404  	paddq	%mm2, %mm4		C				am 2
   405  	movd	8(%edx), %mm5		C				am 2
   406  	movd	%mm6, (%edx)		C				am 2
   407  	psrlq	$32, %mm6		C				am 2
   408  	pmuludq	%mm7, %mm0		C				am 2
   409  	paddq	%mm4, %mm6		C				am 2
   410  	movd	16(%eax), %mm1		C				am 2
   411  	paddq	%mm3, %mm5		C				am 2
   412  	movd	12(%edx), %mm4		C				am 2
   413  	movd	%mm6, 4(%edx)		C				am 2
   414  	psrlq	$32, %mm6		C				am 2
   415  	pmuludq	%mm7, %mm1		C				am 2
   416  	paddq	%mm5, %mm6		C				am 2
   417  	movd	20(%eax), %mm2		C				am 2
   418  	paddq	%mm0, %mm4		C				am 2
   419  	movd	16(%edx), %mm5		C				am 2
   420  	movd	%mm6, 8(%edx)		C				am 2
   421  	psrlq	$32, %mm6		C				am 2
   422  	lea	16(%eax), %eax		C				am 2
   423  	lea	16(%edx), %edx		C				am 2
   424  	sub	$4, %ecx		C				am 2
   425  	ja	L(lam2)			C				am 2
   426  	pmuludq	%mm7, %mm2		C				am 2
   427  	paddq	%mm4, %mm6		C				am 2
   428  	paddq	%mm1, %mm5		C				am 2
   429  	movd	4(%edx), %mm4		C				am 2
   430  	movd	%mm6, -4(%edx)		C				am 2
   431  	psrlq	$32, %mm6		C				am 2
   432  	paddq	%mm5, %mm6		C				am 2
   433  	paddq	%mm2, %mm4		C				am 2
        C L(1) is also the jump target at the end of the L(2m) first pass.
   434  L(1):	movd	%mm6, (%edx)		C				am 2
   435  	psrlq	$32, %mm6		C				am 2
   436  	paddq	%mm4, %mm6		C				am 2
   437  	movd	%mm6, 4(%edx)		C				am 2
   438  	psrlq	$32, %mm6		C				am 2
   439  	movd	%mm6, 8(%edx)		C				am 2
   440  
   441  	lea	8(%edi), %edi		C rp += 2
        C Row tagged am 1.  Same scheme as the other am rows: mm7 is the next
        C multiplier limb from (%esi), its products with the limbs after it
        C are added to the limbs already in rp, and mm6 is the carry
        C accumulator.  The loop is entered at L(am01).
   442  	movd	(%esi), %mm7		C				am 1
   443  	mov	%edi, %edx		C rp2 = rp			am 1
   444  	lea	4(%esi), %esi		C up++				am 1
   445  	lea	(%esi), %eax		C up2 = up			am 1
   446  	movd	(%eax), %mm2		C				am 1
   447  	lea	(%ebx), %ecx		C inner loop count		am 1
   448  	pxor	%mm6, %mm6		C				am 1
   449  	pmuludq	%mm7, %mm2		C				am 1
   450  	movd	4(%eax), %mm3		C				am 1
   451  	movd	(%edx), %mm4		C				am 1
   452  	pmuludq	%mm7, %mm3		C				am 1
   453  	movd	8(%eax), %mm0		C				am 1
   454  	paddq	%mm2, %mm4		C				am 1
   455  	movd	4(%edx), %mm5		C				am 1
   456  	jmp	L(am01)			C				am 1
   457  	ALIGN(16)			C				am 1
   458  L(lam1):
   459  	pmuludq	%mm7, %mm2		C				am 1
   460  	paddq	%mm4, %mm6		C				am 1
   461  	movd	4(%eax), %mm3		C				am 1
   462  	paddq	%mm1, %mm5		C				am 1
   463  	movd	(%edx), %mm4		C				am 1
   464  	movd	%mm6, -8(%edx)		C				am 1
   465  	psrlq	$32, %mm6		C				am 1
   466  	pmuludq	%mm7, %mm3		C				am 1
   467  	paddq	%mm5, %mm6		C				am 1
   468  	movd	8(%eax), %mm0		C				am 1
   469  	paddq	%mm2, %mm4		C				am 1
   470  	movd	4(%edx), %mm5		C				am 1
   471  	movd	%mm6, -4(%edx)		C				am 1
   472  	psrlq	$32, %mm6		C				am 1
   473  L(am01):
   474  	pmuludq	%mm7, %mm0		C				am 1
   475  	paddq	%mm4, %mm6		C				am 1
   476  	movd	12(%eax), %mm1		C				am 1
   477  	paddq	%mm3, %mm5		C				am 1
   478  	movd	8(%edx), %mm4		C				am 1
   479  	movd	%mm6, (%edx)		C				am 1
   480  	psrlq	$32, %mm6		C				am 1
   481  	pmuludq	%mm7, %mm1		C				am 1
   482  	paddq	%mm5, %mm6		C				am 1
   483  	movd	16(%eax), %mm2		C				am 1
   484  	paddq	%mm0, %mm4		C				am 1
   485  	movd	12(%edx), %mm5		C				am 1
   486  	movd	%mm6, 4(%edx)		C				am 1
   487  	psrlq	$32, %mm6		C				am 1
   488  	lea	16(%eax), %eax		C				am 1
   489  	lea	16(%edx), %edx		C				am 1
   490  	sub	$4, %ecx		C				am 1
   491  	ja	L(lam1)			C				am 1
   492  	pmuludq	%mm7, %mm2		C				am 1
   493  	paddq	%mm4, %mm6		C				am 1
   494  	paddq	%mm1, %mm5		C				am 1
   495  	movd	(%edx), %mm4		C				am 1
   496  	movd	%mm6, -8(%edx)		C				am 1
   497  	psrlq	$32, %mm6		C				am 1
   498  	paddq	%mm5, %mm6		C				am 1
   499  	paddq	%mm2, %mm4		C				am 1
        C L(0) is also the jump target at the end of the L(1m) first pass.
   500  L(0):	movd	%mm6, -4(%edx)		C				am 1
   501  	psrlq	$32, %mm6		C				am 1
   502  	paddq	%mm4, %mm6		C				am 1
   503  	movd	%mm6, (%edx)		C				am 1
   504  	psrlq	$32, %mm6		C				am 1
   505  	movd	%mm6, 4(%edx)		C				am 1
   506  
   507  	lea	8(%edi), %edi		C rp += 2
        C Row tagged am 0, last of the four rows done per outer iteration.
        C Same scheme as the other am rows: mm7 is the next multiplier limb
        C from (%esi), its products with the limbs after it are added to the
        C limbs already in rp, and mm6 is the carry accumulator.  The loop is
        C entered at L(am00).
   508  	movd	(%esi), %mm7		C				am 0
   509  	mov	%edi, %edx		C rp2 = rp			am 0
   510  	lea	4(%esi), %esi		C up++				am 0
   511  	lea	(%esi), %eax		C up2 = up			am 0
   512  	movd	(%eax), %mm3		C				am 0
   513  	lea	(%ebx), %ecx		C inner loop count		am 0
   514  	pxor	%mm6, %mm6		C				am 0
   515  	pmuludq	%mm7, %mm3		C				am 0
   516  	movd	4(%eax), %mm0		C				am 0
   517  	movd	(%edx), %mm5		C				am 0
   518  	pmuludq	%mm7, %mm0		C				am 0
   519  	movd	8(%eax), %mm1		C				am 0
   520  	paddq	%mm3, %mm5		C				am 0
   521  	movd	4(%edx), %mm4		C				am 0
   522  	jmp	L(am00)			C				am 0
   523  	ALIGN(16)			C				am 0
   524  L(lam0):
   525  	pmuludq	%mm7, %mm2		C				am 0
   526  	paddq	%mm4, %mm6		C				am 0
   527  	movd	(%eax), %mm3		C				am 0
   528  	paddq	%mm1, %mm5		C				am 0
   529  	movd	-4(%edx), %mm4		C				am 0
   530  	movd	%mm6, -12(%edx)		C				am 0
   531  	psrlq	$32, %mm6		C				am 0
   532  	pmuludq	%mm7, %mm3		C				am 0
   533  	paddq	%mm5, %mm6		C				am 0
   534  	movd	4(%eax), %mm0		C				am 0
   535  	paddq	%mm2, %mm4		C				am 0
   536  	movd	(%edx), %mm5		C				am 0
   537  	movd	%mm6, -8(%edx)		C				am 0
   538  	psrlq	$32, %mm6		C				am 0
   539  	pmuludq	%mm7, %mm0		C				am 0
   540  	paddq	%mm4, %mm6		C				am 0
   541  	movd	8(%eax), %mm1		C				am 0
   542  	paddq	%mm3, %mm5		C				am 0
   543  	movd	4(%edx), %mm4		C				am 0
   544  	movd	%mm6, -4(%edx)		C				am 0
   545  	psrlq	$32, %mm6		C				am 0
   546  L(am00):
   547  	pmuludq	%mm7, %mm1		C				am 0
   548  	paddq	%mm5, %mm6		C				am 0
   549  	movd	12(%eax), %mm2		C				am 0
   550  	paddq	%mm0, %mm4		C				am 0
   551  	movd	8(%edx), %mm5		C				am 0
   552  	movd	%mm6, (%edx)		C				am 0
   553  	psrlq	$32, %mm6		C				am 0
   554  	lea	16(%eax), %eax		C				am 0
   555  	lea	16(%edx), %edx		C				am 0
   556  	sub	$4, %ecx		C				am 0
   557  	ja	L(lam0)			C				am 0
   558  	pmuludq	%mm7, %mm2		C				am 0
   559  	paddq	%mm4, %mm6		C				am 0
   560  	paddq	%mm1, %mm5		C				am 0
   561  	movd	-4(%edx), %mm4		C				am 0
   562  	movd	%mm6, -12(%edx)		C				am 0
   563  	psrlq	$32, %mm6		C				am 0
   564  	paddq	%mm5, %mm6		C				am 0
   565  	paddq	%mm2, %mm4		C				am 0
        C L(3) is also the jump target at the end of the L(0m) first pass.
   566  L(3):	movd	%mm6, -8(%edx)		C				am 0
   567  	psrlq	$32, %mm6		C				am 0
   568  	paddq	%mm4, %mm6		C				am 0
   569  	movd	%mm6, -4(%edx)		C				am 0
   570  	psrlq	$32, %mm6		C				am 0
   571  	movd	%mm6, (%edx)		C				am 0
   572  	sub	$4, %ebx		C inner count shrinks by 4	am 0
   573  	ja	L(outer)			C repeat while it was above 4	am 0
   574  
   575  	mov	%edi, %edx		C edx = rp position of the last row done
   576  	mov	%esi, %eax		C eax = address of the next multiplier limb
   577  	pop	%edi			C restore the registers pushed at L(big)
   578  	pop	%ebx
   579  	pop	%esi
   580  
   581  L(am3):	C up[un-1..un-3] x up[un-4]
        C Three products of the limb at (%eax) with the three limbs after it,
        C added to the three rp limbs at (%edx); a fourth rp limb takes the carry.
   582  	lea	8(%edx), %edx		C rp2 += 2
   583  	movd	(%eax), %mm7		C multiplier limb
   584  	movd	4(%eax), %mm1
   585  	movd	8(%eax), %mm2
   586  	movd	12(%eax), %mm3
   587  	movd	(%edx), %mm4		C rp limbs to add into
   588  	pmuludq	%mm7, %mm1
   589  	movd	4(%edx), %mm5
   590  	pmuludq	%mm7, %mm2
   591  	movd	8(%edx), %mm6
   592  	pmuludq	%mm7, %mm3
   593  	paddq	%mm1, %mm4
   594  	paddq	%mm2, %mm5
   595  	paddq	%mm3, %mm6
   596  	movd	%mm4, (%edx)
   597  	psrlq	$32, %mm4		C mm4 carries between limbs
   598  	paddq	%mm5, %mm4
   599  	movd	%mm4, 4(%edx)
   600  	psrlq	$32, %mm4
   601  	paddq	%mm6, %mm4
   602  	movd	%mm4, 8(%edx)
   603  	psrlq	$32, %mm4
   604  	movd	%mm4, 12(%edx)		C FIXME feed through!
   605  	lea	4(%eax), %eax		C step to the next multiplier limb
   606  
   607  L(am2):	C up[un-1..un-2] x up[un-3]
        C Two products of the limb at (%eax) with the two limbs after it,
        C added to the two rp limbs at (%edx); a third rp limb takes the carry.
        C Reached by fall-through from L(am3) and by the jmp at the end of L(un4).
   608  	lea	8(%edx), %edx		C rp2 += 2
   609  	movd	(%eax), %mm7		C multiplier limb
   610  	movd	4(%eax), %mm1
   611  	movd	8(%eax), %mm2
   612  	movd	(%edx), %mm4		C rp limbs to add into
   613  	movd	4(%edx), %mm5
   614  	pmuludq	%mm7, %mm1
   615  	pmuludq	%mm7, %mm2
   616  	paddq	%mm1, %mm4
   617  	paddq	%mm2, %mm5
   618  	movd	%mm4, (%edx)
   619  	psrlq	$32, %mm4		C mm4 carries between limbs
   620  	paddq	%mm5, %mm4
   621  	movd	%mm4, 4(%edx)
   622  	psrlq	$32, %mm4
   623  	movd	%mm4, 8(%edx)		C FIXME feed through!
   624  	lea	4(%eax), %eax		C step to the next multiplier limb
   625  
   626  L(am1):	C up[un-1] x up[un-2]
        C One product of the limb at (%eax) with the limb after it, added to
        C the rp limb at (%edx); the next rp limb takes the carry.
        C Reached by fall-through from L(am2) and by the jmp at the end of L(un3).
   627  	lea	8(%edx), %edx		C rp2 += 2
   628  	movd	(%eax), %mm7		C multiplier limb
   629  	movd	4(%eax), %mm2
   630  	movd	(%edx), %mm4		C rp limb to add into
   631  	pmuludq	%mm7, %mm2
   632  	paddq	%mm2, %mm4
   633  	movd	%mm4, (%edx)		C low 32 bits back to rp
   634  	psrlq	$32, %mm4
   635  	movd	%mm4, 4(%edx)		C upper part becomes the next rp limb
   636  
   637  C *** diag stuff, use elementary code for now
        C Each rp limb from rp[1] to rp[2*un-2] is doubled (psllq $1) and the
        C 32-bit halves of the squares up[i]^2 are added in, with mm2 carrying
        C between limbs.  rp[0] and rp[2*un-1] receive square halves only.
        C Only reached with un >= 3, since L(un1) and L(un2) return earlier.
   638  
   639  	mov	4(%esp), %edx		C rp (nothing is left pushed on the stack here)
   640  	mov	8(%esp), %eax		C up
   641  	mov	12(%esp), %ecx		C un
   642  
   643  	movd	(%eax), %mm2
   644  	pmuludq	%mm2, %mm2		C src[0]^2
   645  
   646  	pcmpeqd	%mm7, %mm7
   647  	psrlq	$32, %mm7		C mm7 = 0x00000000FFFFFFFF, low-half mask
   648  
   649  	movd	4(%edx), %mm3		C dst[1]
   650  
   651  	movd	%mm2, (%edx)		C dst[0] = low half of src[0]^2
   652  	psrlq	$32, %mm2		C high half of src[0]^2
   653  
   654  	psllq	$1, %mm3		C 2*dst[1]
   655  	paddq	%mm3, %mm2
   656  	movd	%mm2, 4(%edx)
   657  	psrlq	$32, %mm2		C mm2 = carry into the next limb
   658  
   659  	sub	$2, %ecx		C un-2 loop passes, the last limb is done after the loop
   660  
   661  L(diag):
   662  	movd	4(%eax), %mm0		C src limb
   663  	add	$4, %eax
   664  	pmuludq	%mm0, %mm0
   665  	movq	%mm7, %mm1
   666  	pand	%mm0, %mm1		C diagonal low
   667  	psrlq	$32, %mm0		C diagonal high
   668  
   669  	movd	8(%edx), %mm3
   670  	psllq	$1, %mm3		C 2*dst[i]
   671  	paddq	%mm3, %mm1
   672  	paddq	%mm1, %mm2		C add to the running carry
   673  	movd	%mm2, 8(%edx)
   674  	psrlq	$32, %mm2
   675  
   676  	movd	12(%edx), %mm3
   677  	psllq	$1, %mm3		C 2*dst[i+1]
   678  	paddq	%mm3, %mm0
   679  	paddq	%mm0, %mm2		C add to the running carry
   680  	movd	%mm2, 12(%edx)
   681  	add	$8, %edx		C two dst limbs per src limb
   682  	psrlq	$32, %mm2
   683  
   684  	sub	$1, %ecx
   685  	jnz	L(diag)
   686  
   687  	movd	4(%eax), %mm0		C src[size-1]
   688  	pmuludq	%mm0, %mm0
   689  	pand	%mm0, %mm7		C diagonal low
   690  	psrlq	$32, %mm0		C diagonal high
   691  
   692  	movd	8(%edx), %mm3		C dst[2*size-2]
   693  	psllq	$1, %mm3
   694  	paddq	%mm3, %mm7
   695  	paddq	%mm7, %mm2
   696  	movd	%mm2, 8(%edx)
   697  	psrlq	$32, %mm2
   698  
   699  	paddq	%mm0, %mm2		C top limb is written without being read
   700  	movd	%mm2, 12(%edx)		C dst[2*size-1]
   701  
   702  	emms
   703  	ret
   704  
   705  EPILOGUE()