github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/sqr_basecase.asm

dnl  AMD64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C NOTES
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
C     code which uses addmul_2s from the start, conditionally leaving a 1x1
C     multiply to the end.  (In assembly code, one would stop invoking
C     addmul_2s loops when perhaps a 3x2 or a 2x2 remains.)  A C sketch of
C     the overall scheme appears after the variable list below.
C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
C     save/restore carry; instead it can propagate into the high product word.
C   * Align more labels; this should shave off a few cycles.
C   * We can safely use 32-bit size operations, since operands with (2^32)
C     limbs will lead to non-termination in practice.
C   * The jump table could probably be optimized, at least for non-pic.
C   * The special code for n <= 4 was quickly written.  It is probably too
C     large and unnecessarily slow.
C   * Consider combining the small cases code so that the n=k-1 code jumps
C     into the middle of the n=k code.
C   * Avoid saving registers for the small cases code.
C   * Needed variables:
C    n   r11  input size
C    i   r8   work left, initially n
C    j   r9   inner loop count
C        r15  unused
C    v0  r13
C    v1  r14
C    rp  rdi
C    up  rsi
C    w0  rbx
C    w1  rcx
C    w2  rbp
C    w3  r10
C    tp  r12
C    lo  rax
C    hi  rdx
C        rsp

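C  For orientation, the scheme used below is: accumulate the cross products
C  u[i]*u[j] (i < j) with mul_1/addmul_2 style loops, then double that
C  triangle and add the diagonal squares u[i]^2.  The following C sketch is
C  not GMP code and not a transcription of this file; it is a rough,
C  unoptimized equivalent, assuming 64-bit limbs and the compiler-specific
C  unsigned __int128 type, with made-up names (`limb', `sqr_basecase_sketch').
C
C	#include <stdint.h>
C
C	typedef uint64_t limb;
C
C	static void
C	sqr_basecase_sketch (limb *rp, const limb *up, int n)
C	{
C	  int i, j;
C	  for (i = 0; i < 2*n; i++)
C	    rp[i] = 0;
C	  /* cross products u[i]*u[j] for i < j */
C	  for (i = 0; i < n - 1; i++)
C	    {
C	      limb cy = 0;
C	      for (j = i + 1; j < n; j++)
C	        {
C	          unsigned __int128 t = (unsigned __int128) up[i] * up[j] + rp[i+j] + cy;
C	          rp[i+j] = (limb) t;
C	          cy = (limb) (t >> 64);
C	        }
C	      rp[i+n] = cy;
C	    }
C	  /* rp = 2*rp: double the cross-product triangle */
C	  limb cy = 0;
C	  for (i = 0; i < 2*n; i++)
C	    {
C	      limb hi = rp[i] >> 63;
C	      rp[i] = (rp[i] << 1) | cy;
C	      cy = hi;
C	    }
C	  /* rp += sum of up[i]^2 * B^(2i), B = 2^64 */
C	  cy = 0;
C	  for (i = 0; i < n; i++)
C	    {
C	      unsigned __int128 s  = (unsigned __int128) up[i] * up[i];
C	      unsigned __int128 t0 = (unsigned __int128) rp[2*i] + (limb) s + cy;
C	      rp[2*i] = (limb) t0;
C	      unsigned __int128 t1 = (unsigned __int128) rp[2*i+1] + (limb) (s >> 64) + (limb) (t0 >> 64);
C	      rp[2*i+1] = (limb) t1;
C	      cy = (limb) (t1 >> 64);
C	    }
C	}
C
C  The real code fuses the doubling and the diagonal addition into a single
C  sqr_diag_addlsh1 pass and parks carry bits in %rbx/%rbp between steps (see
C  the end of this file).
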
C INPUT PARAMETERS
define(`rp',	  `%rdi')
define(`up',	  `%rsi')
define(`n_param', `%rdx')

define(`n',	`%r11')
define(`tp',	`%r12')
define(`i',	`%r8')
define(`j',	`%r9')
define(`v0',	`%r13')
define(`v1',	`%r14')
define(`w0',	`%rbx')
define(`w1',	`%rcx')
define(`w2',	`%rbp')
define(`w3',	`%r10')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
	FUNC_ENTRY(3)
	mov	R32(n_param), R32(%rcx)
	mov	R32(n_param), R32(n)		C free original n register (rdx)

	add	$-40, %rsp

	and	$3, R32(%rcx)
	cmp	$4, R32(n_param)
	lea	4(%rcx), %r8

	mov	%rbx, 32(%rsp)
	mov	%rbp, 24(%rsp)
	mov	%r12, 16(%rsp)
	mov	%r13, 8(%rsp)
	mov	%r14, (%rsp)

	cmovg	%r8, %rcx

	lea	L(tab)(%rip), %rax
ifdef(`PIC',
`	movslq	(%rax,%rcx,4), %r10
	add	%r10, %rax
	jmp	*%rax
',`
	jmp	*(%rax,%rcx,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(4), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(0m4), L(tab))
	JMPENT(	L(1m4), L(tab))
	JMPENT(	L(2m4), L(tab))
	JMPENT(	L(3m4), L(tab))
	TEXT

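C  The dispatch above, in C terms (illustrative only; the label names below
C  are stand-ins for the L() labels in this file): the table index is n & 3
C  when n <= 4, so n == 4 lands on slot 0 which is L(4), and 4 + (n & 3) when
C  n > 4 (the cmovg).
C
C	switch ((n_param > 4 ? 4 : 0) + (n_param & 3))
C	  {
C	  case 0: goto L4;			/* n == 4 */
C	  case 1: goto L1;
C	  case 2: goto L2;
C	  case 3: goto L3;
C	  case 4: goto L0m4;			/* n > 4, n = 0 mod 4 */
C	  case 5: goto L1m4;
C	  case 6: goto L2m4;
C	  case 7: goto L3m4;
C	  }
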
L(1):	mov	(up), %rax
	mul	%rax
	add	$40, %rsp
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(2):	mov	(up), %rax
	mov	%rax, %r8
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, %r9
	mul	%rax
	add	$40, %rsp
	mov	%rax, %r10
	mov	%r11, %rax
	mov	%rdx, %r11
	mul	%r8
	xor	%r8, %r8
	add	%rax, %r9
	adc	%rdx, %r10
	adc	%r8, %r11
	add	%rax, %r9
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	%r8, %r11
	mov	%r11, 24(rp)
	FUNC_EXIT()
	ret

L(3):	mov	(up), %rax
	mov	%rax, %r10
	mul	%rax
	mov	8(up), %r11
	mov	%rax, (rp)
	mov	%r11, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	16(up), %rcx
	mov	%rax, 16(rp)
	mov	%rcx, %rax
	mov	%rdx, 24(rp)
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	%r11, %rax
	mul	%r10
	mov	%rax, %r8
	mov	%rcx, %rax
	mov	%rdx, %r9
	mul	%r10
	xor	%r10, %r10
	add	%rax, %r9
	mov	%r11, %rax
	mov	%r10, %r11
	adc	%rdx, %r10

	mul	%rcx
	add	$40, %rsp
	add	%rax, %r10
	adc	%r11, %rdx
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%rdx, %rdx
	adc	%r11, %r11
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%rdx, 32(rp)
	adc	%r11, 40(rp)
	FUNC_EXIT()
	ret

L(4):	mov	(up), %rax
	mov	%rax, %r11
	mul	%rax
	mov	8(up), %rbx
	mov	%rax, (rp)
	mov	%rbx, %rax
	mov	%rdx, 8(rp)
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)
	mov	24(up), %rax
	mul	%rax
	mov	%rax, 48(rp)
	mov	%rbx, %rax
	mov	%rdx, 56(rp)

	mul	%r11
	add	$32, %rsp
	mov	%rax, %r8
	mov	%rdx, %r9
	mov	16(up), %rax
	mul	%r11
	xor	%r10, %r10
	add	%rax, %r9
	adc	%rdx, %r10
	mov	24(up), %rax
	mul	%r11
	xor	%r11, %r11
	add	%rax, %r10
	adc	%rdx, %r11
	mov	16(up), %rax
	mul	%rbx
	xor	%rcx, %rcx
	add	%rax, %r10
	adc	%rdx, %r11
	adc	$0, %rcx
	mov	24(up), %rax
	mul	%rbx
	pop	%rbx
	add	%rax, %r11
	adc	%rdx, %rcx
	mov	16(up), %rdx
	mov	24(up), %rax
	mul	%rdx
	add	%rax, %rcx
	adc	$0, %rdx

	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%r11, %r11
	adc	%rcx, %rcx
	mov	$0, R32(%rax)
	adc	%rdx, %rdx

	adc	%rax, %rax
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%r11, 32(rp)
	adc	%rcx, 40(rp)
	adc	%rdx, 48(rp)
	adc	%rax, 56(rp)
	FUNC_EXIT()
	ret


L(0m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0
	mov	8(up), %rax
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i
C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
	xor	R32(j), R32(j)
	sub	n, j

	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	16(up,j,8), %rax
	mov	%rdx, w3
	jmp	L(L3)

	ALIGN(16)
L(mul_1_m3_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
L(L3):	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m3_top)

	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up
	jmp	L(dowhile)


L(1m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-3(n), i
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
	lea	-3(n), j
	neg	j

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	%rax, 8(rp)
	jmp	L(m0)

	ALIGN(16)
L(mul_2_m0_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
	mul	v0			C u0 * u2
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
L(m2x):	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m0_top)

	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up
	lea	eval(3*8-24)(tp), tp	C tp += 3
	jmp	L(dowhile_end)


L(2m4):
	lea	-16(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0
	mov	8(up), %rax
	lea	(up,n,8), up		C point up at end of input operand

	lea	-4(n), i
C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
	lea	-2(n), j
	neg	j

	mul	v0
	mov	%rax, w2
	mov	(up,j,8), %rax
	mov	%rdx, w1
	jmp	L(L1)

	ALIGN(16)
L(mul_1_m1_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
L(L1):	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m1_top)

	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
	lea	-8(up), up
	jmp	L(dowhile_mid)


L(3m4):
	lea	8(rp,n,8), tp		C point tp in middle of result operand
	mov	(up), v0		C u0
	mov	8(up), %rax		C u1
	lea	8(up,n,8), up		C point up at end of input operand

	lea	-5(n), i
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
	lea	-1(n), j
	neg	j

	mov	%rax, v1		C u1
	mul	v0			C u0 * u1
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, 8(rp)
	jmp	L(m2)

	ALIGN(16)
L(mul_2_m2_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
L(m2):	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m2_top)

	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)

	lea	-16(up), up
	jmp	L(dowhile_mid)

L(dowhile):
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
	lea	4(i), j
	neg	j

	mov	16(up,j,8), v0
	mov	24(up,j,8), v1
	mov	24(up,j,8), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, 24(tp,j,8)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	jmp	L(am2)

	ALIGN(16)
L(addmul_2_m2_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1				C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0				C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1				C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
L(am2):	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m2_top)

	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2

	add	$-2, R32(i)		C i -= 2

L(dowhile_mid):
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
	lea	2(i), j
	neg	j

	mov	(up,j,8), v0
	mov	8(up,j,8), v1
	mov	8(up,j,8), %rax
	mul	v0
	xor	R32(w1), R32(w1)
	add	%rax, 8(tp,j,8)
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(20)

	ALIGN(16)
L(addmul_2_m0_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1				C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
L(20):	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0				C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1				C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m0_top)

	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

	lea	eval(2*8)(tp), tp	C tp += 2
L(dowhile_end):

	add	$-2, R32(i)		C i -= 2
	jne	L(dowhile)

C Function mpn_addmul_2s_2
	mov	-16(up), v0
	mov	-8(up), v1
	mov	-8(up), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, -8(tp)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	(up), %rax
	mul	v0
	add	%rax, w3
	mov	(up), %rax
	adc	%rdx, w0
	mul	v1
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)

C Function mpn_sqr_diag_addlsh1
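C  The pass below doubles the cross-product sum already sitting in the result
C  area and adds the diagonal squares up[i]^2, one limb pair per step.  Two
C  carry chains are involved: the bit shifted out by the doubling and the
C  carry of the addition; the sbb/add "save CF"/"restore carry" pairs on
C  %rbx/%rbp park the doubling carry while the flags are needed elsewhere.
C  As a rough C equivalent (not GMP code, and organized a little differently
C  from the assembly, which folds the addition carry into the next adc
C  doubling; `limb' and 64-bit limbs as in the sketch near the top, and the
C  result limbs outside the cross-product triangle assumed zero on entry):
C
C	limb cd = 0, ca = 0;		/* doubling carry, addition carry */
C	for (i = 0; i < n; i++)
C	  {
C	    unsigned __int128 s = (unsigned __int128) up[i] * up[i];
C	    limb l0 = rp[2*i], l1 = rp[2*i+1];
C	    limb d0 = (l0 << 1) | cd;	/* double two result limbs */
C	    limb d1 = (l1 << 1) | (l0 >> 63);
C	    cd = l1 >> 63;
C	    unsigned __int128 t0 = (unsigned __int128) d0 + (limb) s + ca;
C	    rp[2*i] = (limb) t0;
C	    unsigned __int128 t1 = (unsigned __int128) d1 + (limb) (s >> 64) + (limb) (t0 >> 64);
C	    rp[2*i+1] = (limb) t1;
C	    ca = (limb) (t1 >> 64);
C	  }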
	lea	-4(n,n), j

	mov	8(rp), %r11
	lea	-8(up), up
	lea	(rp,j,8), rp
	neg	j
	mov	(up,j,4), %rax
	mul	%rax
	test	$2, R8(j)
	jnz	L(odd)

L(evn):	add	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)		C save CF
	add	%rdx, %r11
	mov	%rax, (rp,j,8)
	jmp	L(d0)

L(odd):	add	%r11, %r11
	sbb	R32(%rbp), R32(%rbp)		C save CF
	add	%rdx, %r11
	mov	%rax, (rp,j,8)
	lea	-2(j), j
	jmp	L(d1)

	ALIGN(16)
L(top):	mov	(up,j,4), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp,j,8)
L(d0):	mov	%r11, 8(rp,j,8)
	mov	16(rp,j,8), %r10
	adc	%r10, %r10
	mov	24(rp,j,8), %r11
	adc	%r11, %r11
	nop
	sbb	R32(%rbp), R32(%rbp)		C save CF
	mov	8(up,j,4), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, 16(rp,j,8)
L(d1):	mov	%r11, 24(rp,j,8)
	mov	32(rp,j,8), %r10
	adc	%r10, %r10
	mov	40(rp,j,8), %r11
	adc	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)		C save CF
	add	$4, j
	js	L(top)

	mov	(up), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)		C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp)
	mov	%r11, 8(rp)
	mov	16(rp), %r10
	adc	%r10, %r10
	sbb	R32(%rbp), R32(%rbp)		C save CF
	neg	R32(%rbp)
	mov	8(up), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)		C restore carry
	adc	%rax, %r10
	adc	%rbp, %rdx
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()