github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/pa64/addmul_1.asm (about)

     1  dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
     2  dnl  add the result to a second limb vector.
     3  
     4  dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C		    cycles/limb
    35  C 8000,8200:		7
    36  C 8500,8600,8700:	6.375
    37  
    38  C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
    39  C  could be saved there per call.
    40  
    41  C  DESCRIPTION:
    42  C  The main loop "BIG" is 4-way unrolled, mainly to allow
    43  C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
    44  C  registers to the IU registers, have demanded a deep software pipeline, and
    45  C  a lot of stack slots for partial products in flight.
    46  C
    47  C  CODE STRUCTURE:
    48  C  save-some-registers
    49  C  do 0, 1, 2, or 3 limbs
    50  C  if done, restore-some-regs and return
    51  C  save-many-regs
    52  C  do 4, 8, ... limbs
    53  C  restore-all-regs
    54  
    55  C  STACK LAYOUT:
    56  C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
    57  C  slots marked FREE, as well as some slots in the caller's "frame marker".
    58  C
    59  C -00 <- r30
    60  C -08  FREE
    61  C -10  tmp
    62  C -18  tmp
    63  C -20  tmp
    64  C -28  tmp
    65  C -30  tmp
    66  C -38  tmp
    67  C -40  tmp
    68  C -48  tmp
    69  C -50  tmp
    70  C -58  tmp
    71  C -60  tmp
    72  C -68  tmp
    73  C -70  tmp
    74  C -78  tmp
    75  C -80  tmp
    76  C -88  tmp
    77  C -90  FREE
    78  C -98  FREE
    79  C -a0  FREE
    80  C -a8  FREE
    81  C -b0  r13
    82  C -b8  r12
    83  C -c0  r11
    84  C -c8  r10
    85  C -d0  r9
    86  C -d8  r8
    87  C -e0  r7
    88  C -e8  r6
    89  C -f0  r5
    90  C -f8  r4
    91  C -100 r3
    92  C  Previous frame:
    93  C  [unused area]
    94  C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
    95  
    96  
    97  include(`../config.m4')
    98  
    99  C INPUT PARAMETERS:
   100  define(`rp',`%r26')	C limb vector that is loaded, added to and stored back
   101  define(`up',`%r25')	C limb vector that is multiplied by vlimb
   102  define(`n',`%r24')	C limb count
   103  define(`vlimb',`%r23')	C multiplier limb; stored to its home slot for 2.0w
   104  
   105  define(`climb',`%r23')	C carry limb; same register as vlimb
   106  
   107  ifdef(`HAVE_ABI_2_0w',
   108  `	.level	2.0w
   109  ',`	.level	2.0
   110  ')
   111  PROLOGUE(mpn_addmul_1)
        C Entry code: get vlimb into memory so it can be loaded into an FP
        C register, save r3-r5, open a 0x100 byte frame and clear the carry limb.
   112  
   113  ifdef(`HAVE_ABI_2_0w',
   114  `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
   115  ')
   116  	std,ma		%r3, 0x100(%r30)	C save r3 and advance r30 by 0x100
   117  	std		%r4, -0xf8(%r30)	C save r4
   118  	std		%r5, -0xf0(%r30)	C save r5
   119  	ldo		0(%r0), climb		C clear climb
   120  	fldd		-0x138(%r30), %fr8	C put vlimb in fp register (home slot is now at -0x138)
   121  
   122  define(`p032a1',`%r1')	C mid product reloaded from -0x78(%r30)
   123  define(`p032a2',`%r19')	C mid product reloaded from -0x70(%r30)
   124  
   125  define(`m032',`%r20')	C p032a1 + p032a2
   126  define(`m096',`%r21')	C carry out of that sum
   127  
   128  define(`p000a',`%r22')	C low product reloaded from -0x80(%r30)
   129  define(`p064a',`%r29')	C high product reloaded from -0x68(%r30)
   130  
   131  define(`s000',`%r31')	C result limb being accumulated, stored to rp
   132  
   133  define(`ma000',`%r4')	C low half of m032 moved into the upper half
   134  define(`ma064',`%r20')	C upper half of m032 with m096 above it; shares %r20 with m032
   135  
   136  define(`r000',`%r3')	C limb loaded from rp
   137  
   138  	extrd,u		n, 63, 2, %r5		C r5 = low two bits of n, i.e. n mod 4
   139  	cmpb,=		%r5, %r0, L(BIG)	C no leftover limbs: go to the 4-way code
   140  	nop					C branch delay slot
   141  
        C First leftover limb: form the four 32x32 partial products of vlimb and
        C the limb, and pass them to the integer side through the stack.
   142  	fldd		0(up), %fr4
   143  	ldo		8(up), up
   144  	xmpyu		%fr8R, %fr4L, %fr22
   145  	xmpyu		%fr8L, %fr4R, %fr23
   146  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   147  	xmpyu		%fr8R, %fr4R, %fr24
   148  	xmpyu		%fr8L, %fr4L, %fr25
   149  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   150  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   151  	addib,<>	-1, %r5, L(two_or_more)	C r5 still nonzero: more leftover limbs
   152  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   153  LDEF(one)
        C Exactly one leftover limb: reload its products and finish in 0_one_out.
   154  	ldd		-0x78(%r30), p032a1
   155  	ldd		-0x70(%r30), p032a2
   156  	ldd		-0x80(%r30), p000a
   157  	b		L(0_one_out)
   158  	ldd		-0x68(%r30), p064a	C executed in the branch delay slot
   159  
   160  LDEF(two_or_more)
        C Second leftover limb.  Each ldd fetches a product of the first limb from
        C a stack slot just before the fstd that reuses the slot for the second.
   161  	fldd		0(up), %fr4
   162  	ldo		8(up), up
   163  	xmpyu		%fr8R, %fr4L, %fr22
   164  	xmpyu		%fr8L, %fr4R, %fr23
   165  	ldd		-0x78(%r30), p032a1
   166  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   167  	xmpyu		%fr8R, %fr4R, %fr24
   168  	xmpyu		%fr8L, %fr4L, %fr25
   169  	ldd		-0x70(%r30), p032a2
   170  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   171  	ldd		-0x80(%r30), p000a
   172  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   173  	ldd		-0x68(%r30), p064a
   174  	addib,<>	-1, %r5, L(three_or_more)	C r5 still nonzero: a third leftover limb
   175  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   176  LDEF(two)
        C Exactly two leftover limbs: combine the mid products of the first limb,
        C fetch the rp limb and continue in 0_two_out.
   177  	add		p032a1, p032a2, m032	C sum of the two mid products
   178  	add,dc		%r0, %r0, m096		C carry out of that sum
   179  	depd,z		m032, 31, 32, ma000	C low half of m032 into the upper half, rest zero
   180  	extrd,u		m032, 31, 32, ma064	C upper half of m032, right justified
   181  	ldd		0(rp), r000
   182  	b		L(0_two_out)
   183  	depd		m096, 31, 32, ma064	C delay slot: put m096 in the upper half of ma064
   184  
   185  LDEF(three_or_more)
        C Third leftover limb: start its load and combine the mid products of the
        C first limb.  up is advanced at the start of 0_out.
   186  	fldd		0(up), %fr4
   187  	add		p032a1, p032a2, m032	C sum of the two mid products
   188  	add,dc		%r0, %r0, m096		C carry out of that sum
   189  	depd,z		m032, 31, 32, ma000	C low half of m032 into the upper half, rest zero
   190  	extrd,u		m032, 31, 32, ma064	C upper half of m032, right justified
   191  	ldd		0(rp), r000
   192  C	addib,=		-1, %r5, L(0_out)
   193  	depd		m096, 31, 32, ma064	C put m096 in the upper half of ma064
   194  LDEF(loop0)	C the loop body below is commented out, so control falls through to 0_out
   195  C	xmpyu		%fr8R, %fr4L, %fr22
   196  C	xmpyu		%fr8L, %fr4R, %fr23
   197  C	ldd		-0x78(%r30), p032a1
   198  C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   199  C
   200  C	xmpyu		%fr8R, %fr4R, %fr24
   201  C	xmpyu		%fr8L, %fr4L, %fr25
   202  C	ldd		-0x70(%r30), p032a2
   203  C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   204  C
   205  C	ldo		8(rp), rp
   206  C	add		climb, p000a, s000
   207  C	ldd		-0x80(%r30), p000a
   208  C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   209  C
   210  C	add,dc		p064a, %r0, climb
   211  C	ldo		8(up), up
   212  C	ldd		-0x68(%r30), p064a
   213  C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   214  C
   215  C	add		ma000, s000, s000
   216  C	add,dc		ma064, climb, climb
   217  C	fldd		0(up), %fr4
   218  C
   219  C	add		r000, s000, s000
   220  C	add,dc		%r0, climb, climb
   221  C	std		s000, -8(rp)
   222  C
   223  C	add		p032a1, p032a2, m032
   224  C	add,dc		%r0, %r0, m096
   225  C
   226  C	depd,z		m032, 31, 32, ma000
   227  C	extrd,u		m032, 31, 32, ma064
   228  C	ldd		0(rp), r000
   229  C	addib,<>	-1, %r5, L(loop0)
   230  C	depd		m096, 31, 32, ma064
   231  LDEF(0_out)
        C Multiply the limb already loaded in fr4, reload the products of the
        C previous limb from the stack as the new ones are stored, and finish the
        C limb before that: climb + low product + mid parts + rp limb.
   232  	ldo		8(up), up
   233  	xmpyu		%fr8R, %fr4L, %fr22
   234  	xmpyu		%fr8L, %fr4R, %fr23
   235  	ldd		-0x78(%r30), p032a1
   236  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   237  	xmpyu		%fr8R, %fr4R, %fr24
   238  	xmpyu		%fr8L, %fr4L, %fr25
   239  	ldd		-0x70(%r30), p032a2
   240  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   241  	ldo		8(rp), rp
   242  	add		climb, p000a, s000	C uses the old p000a before it is reloaded
   243  	ldd		-0x80(%r30), p000a
   244  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   245  	add,dc		p064a, %r0, climb	C high product plus carry becomes the new climb
   246  	ldd		-0x68(%r30), p064a
   247  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   248  	add		ma000, s000, s000	C add the mid parts
   249  	add,dc		ma064, climb, climb
   250  	add		r000, s000, s000	C add the limb loaded from rp
   251  	add,dc		%r0, climb, climb
   252  	std		s000, -8(rp)		C rp was already advanced, hence -8
   253  	add		p032a1, p032a2, m032	C mid product sum for the next limb
   254  	add,dc		%r0, %r0, m096
   255  	depd,z		m032, 31, 32, ma000
   256  	extrd,u		m032, 31, 32, ma064
   257  	ldd		0(rp), r000
   258  	depd		m096, 31, 32, ma064
   259  LDEF(0_two_out)
        C No further limb to multiply here: reload the products of the last
        C leftover limb and finish the limb before it.
   260  	ldd		-0x78(%r30), p032a1
   261  	ldd		-0x70(%r30), p032a2
   262  	ldo		8(rp), rp
   263  	add		climb, p000a, s000	C uses the old p000a before it is reloaded
   264  	ldd		-0x80(%r30), p000a
   265  	add,dc		p064a, %r0, climb	C high product plus carry becomes the new climb
   266  	ldd		-0x68(%r30), p064a
   267  	add		ma000, s000, s000	C add the mid parts
   268  	add,dc		ma064, climb, climb
   269  	add		r000, s000, s000	C add the limb loaded from rp
   270  	add,dc		%r0, climb, climb
   271  	std		s000, -8(rp)		C rp was already advanced, hence -8
   272  LDEF(0_one_out)
        C Finish the last leftover limb, then either return or continue with the
        C 4-way code.
   273  	add		p032a1, p032a2, m032	C sum of the two mid products
   274  	add,dc		%r0, %r0, m096		C carry out of that sum
   275  	depd,z		m032, 31, 32, ma000	C low half of m032 into the upper half, rest zero
   276  	extrd,u		m032, 31, 32, ma064	C upper half of m032, right justified
   277  	ldd		0(rp), r000
   278  	depd		m096, 31, 32, ma064	C put m096 in the upper half of ma064
   279  
   280  	add		climb, p000a, s000
   281  	add,dc		p064a, %r0, climb
   282  	add		ma000, s000, s000
   283  	add,dc		ma064, climb, climb
   284  	add		r000, s000, s000
   285  	add,dc		%r0, climb, climb
   286  	std		s000, 0(rp)
   287  
   288  	cmpib,>=	4, n, L(done)		C 4 >= n: no group of four limbs remains
   289  	ldo		8(rp), rp		C executed in the branch delay slot
   290  
   291  C 4-way unrolled code.
   292  
   293  LDEF(BIG)
        C 4-way code setup.  The aliases below are redefined for this part; several
        C names share one register (for example p032a1, p000a and r000 are all %r1)
        C and are live at different points of the pipeline.  The suffixes a, b, c, d
        C go with the limbs loaded into fr4, fr5, fr6, fr7.
   294  
   295  define(`p032a1',`%r1')	C mid products, reloaded from the stack
   296  define(`p032a2',`%r19')	C
   297  define(`p096b1',`%r20')	C
   298  define(`p096b2',`%r21')	C
   299  define(`p160c1',`%r22')	C
   300  define(`p160c2',`%r29')	C
   301  define(`p224d1',`%r31')	C
   302  define(`p224d2',`%r3')	C
   303  			C
   304  define(`m032',`%r4')	C carry-chained sums of the mid product pairs
   305  define(`m096',`%r5')	C
   306  define(`m160',`%r6')	C
   307  define(`m224',`%r7')	C
   308  define(`m288',`%r8')	C carry out of the last mid sum
   309  			C
   310  define(`p000a',`%r1')	C low and high products, reloaded from the stack
   311  define(`p064a',`%r19')	C
   312  define(`p064b',`%r20')	C
   313  define(`p128b',`%r21')	C
   314  define(`p128c',`%r22')	C
   315  define(`p192c',`%r29')	C
   316  define(`p192d',`%r31')	C
   317  define(`p256d',`%r3')	C
   318  			C
   319  define(`s000',`%r10')	C the four result limbs being accumulated
   320  define(`s064',`%r11')	C
   321  define(`s128',`%r12')	C
   322  define(`s192',`%r13')	C
   323  			C
   324  define(`ma000',`%r9')	C mid sums realigned by 32 bits
   325  define(`ma064',`%r4')	C
   326  define(`ma128',`%r5')	C
   327  define(`ma192',`%r6')	C
   328  define(`ma256',`%r7')	C
   329  			C
   330  define(`r000',`%r1')	C the four limbs loaded from rp
   331  define(`r064',`%r19')	C
   332  define(`r128',`%r20')	C
   333  define(`r192',`%r21')	C
   334  
        C Save r6-r13 in the slots listed under STACK LAYOUT; they are restored
        C after end1.
   335  	std		%r6, -0xe8(%r30)
   336  	std		%r7, -0xe0(%r30)
   337  	std		%r8, -0xd8(%r30)
   338  	std		%r9, -0xd0(%r30)
   339  	std		%r10, -0xc8(%r30)
   340  	std		%r11, -0xc0(%r30)
   341  	std		%r12, -0xb8(%r30)
   342  	std		%r13, -0xb0(%r30)
   343  
        C From here on n counts groups of four limbs.
   344  ifdef(`HAVE_ABI_2_0w',
   345  `	extrd,u		n, 61, 62, n		C right shift 2
   346  ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
   347  ')
   348  
   349  LDEF(4_or_more)
        C Load the first group of four limbs and form all 16 partial products.
        C fr22-fr27 are reused for the low and high products once their mid
        C products have been stored.
   350  	fldd		0(up), %fr4
   351  	fldd		8(up), %fr5
   352  	fldd		16(up), %fr6
   353  	fldd		24(up), %fr7
   354  	xmpyu		%fr8R, %fr4L, %fr22
   355  	xmpyu		%fr8L, %fr4R, %fr23
   356  	xmpyu		%fr8R, %fr5L, %fr24
   357  	xmpyu		%fr8L, %fr5R, %fr25
   358  	xmpyu		%fr8R, %fr6L, %fr26
   359  	xmpyu		%fr8L, %fr6R, %fr27
   360  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   361  	xmpyu		%fr8R, %fr7L, %fr28
   362  	xmpyu		%fr8L, %fr7R, %fr29
   363  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   364  	xmpyu		%fr8R, %fr4R, %fr30
   365  	xmpyu		%fr8L, %fr4L, %fr31
   366  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   367  	xmpyu		%fr8R, %fr5R, %fr22
   368  	xmpyu		%fr8L, %fr5L, %fr23
   369  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   370  	xmpyu		%fr8R, %fr6R, %fr24
   371  	xmpyu		%fr8L, %fr6L, %fr25
   372  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   373  	xmpyu		%fr8R, %fr7R, %fr26
   374  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
   375  	addib,<>	-1, n, L(8_or_more)	C more than one group of four
   376  	xmpyu		%fr8L, %fr7L, %fr27	C executed in the branch delay slot
        C Only one group: store the remaining products, reload the mid products
        C and finish in end1.
   377  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   378  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   379  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   380  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   381  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   382  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   383  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   384  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   385  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   386  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   387  	ldd		-0x78(%r30), p032a1
   388  	ldd		-0x70(%r30), p032a2
   389  	ldd		-0x38(%r30), p096b1
   390  	ldd		-0x30(%r30), p096b2
   391  	ldd		-0x58(%r30), p160c1
   392  	ldd		-0x50(%r30), p160c2
   393  	ldd		-0x18(%r30), p224d1
   394  	ldd		-0x10(%r30), p224d2
   395  	b		L(end1)
   396  	nop					C branch delay slot
   397  
   398  LDEF(8_or_more)
        C Store the remaining products of the first group, then load and multiply
        C the second group.  Each ldd fetches a mid product of the first group
        C before the fstd that reuses its slot for the second group.
   399  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   400  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   401  	ldo		32(up), up
   402  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   403  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   404  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   405  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   406  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   407  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   408  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   409  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   410  	fldd		0(up), %fr4
   411  	fldd		8(up), %fr5
   412  	fldd		16(up), %fr6
   413  	fldd		24(up), %fr7
   414  	xmpyu		%fr8R, %fr4L, %fr22
   415  	ldd		-0x78(%r30), p032a1
   416  	xmpyu		%fr8L, %fr4R, %fr23
   417  	xmpyu		%fr8R, %fr5L, %fr24
   418  	ldd		-0x70(%r30), p032a2
   419  	xmpyu		%fr8L, %fr5R, %fr25
   420  	xmpyu		%fr8R, %fr6L, %fr26
   421  	ldd		-0x38(%r30), p096b1
   422  	xmpyu		%fr8L, %fr6R, %fr27
   423  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   424  	xmpyu		%fr8R, %fr7L, %fr28
   425  	ldd		-0x30(%r30), p096b2
   426  	xmpyu		%fr8L, %fr7R, %fr29
   427  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   428  	xmpyu		%fr8R, %fr4R, %fr30
   429  	ldd		-0x58(%r30), p160c1
   430  	xmpyu		%fr8L, %fr4L, %fr31
   431  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   432  	xmpyu		%fr8R, %fr5R, %fr22
   433  	ldd		-0x50(%r30), p160c2
   434  	xmpyu		%fr8L, %fr5L, %fr23
   435  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   436  	xmpyu		%fr8R, %fr6R, %fr24
   437  	ldd		-0x18(%r30), p224d1
   438  	xmpyu		%fr8L, %fr6L, %fr25
   439  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   440  	xmpyu		%fr8R, %fr7R, %fr26
   441  	ldd		-0x10(%r30), p224d2
   442  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
   443  	addib,=		-1, n, L(end2)		C no further group to load: wind down
   444  	xmpyu		%fr8L, %fr7L, %fr27	C executed in the branch delay slot
   445  LDEF(loop)
        C Main loop, one group of four limbs per pass.  On entry the integer
        C registers hold the mid products of the previous group, the stack holds
        C its low and high products, and fr22-fr31 hold products of the current
        C group that are not stored yet.  Each ldd of a stack slot comes before
        C the fstd that overwrites it.
   446  	add		p032a1, p032a2, m032	C mid sums, carry chained through m288
   447  	ldd		-0x80(%r30), p000a
   448  	add,dc		p096b1, p096b2, m096
   449  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   450  
   451  	add,dc		p160c1, p160c2, m160
   452  	ldd		-0x68(%r30), p064a
   453  	add,dc		p224d1, p224d2, m224
   454  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   455  
   456  	add,dc		%r0, %r0, m288
   457  	ldd		-0x40(%r30), p064b
   458  	ldo		32(up), up
   459  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   460  
   461  	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits
   462  	ldd		-0x28(%r30), p128b
   463  	extrd,u		m032, 31, 32, ma064
   464  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   465  
   466  	depd		m096, 31, 32, ma064
   467  	ldd		-0x60(%r30), p128c
   468  	extrd,u		m096, 31, 32, ma128
   469  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   470  
   471  	depd		m160, 31, 32, ma128
   472  	ldd		-0x48(%r30), p192c
   473  	extrd,u		m160, 31, 32, ma192
   474  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   475  
   476  	depd		m224, 31, 32, ma192
   477  	ldd		-0x20(%r30), p192d
   478  	extrd,u		m224, 31, 32, ma256
   479  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   480  
   481  	depd		m288, 31, 32, ma256
   482  	ldd		-0x88(%r30), p256d
   483  	add		climb, p000a, s000	C carry in plus low and high products
   484  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   485  
   486  	add,dc		p064a, p064b, s064
   487  	ldd		0(rp), r000
   488  	add,dc		p128b, p128c, s128
   489  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   490  
   491  	add,dc		p192c, p192d, s192
   492  	ldd		8(rp), r064
   493  	add,dc		p256d, %r0, climb
   494  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   495  
   496  	ldd		16(rp), r128
   497  	add		ma000, s000, s000	C accum mid 0
   498  	ldd		24(rp), r192
   499  	add,dc		ma064, s064, s064	C accum mid 1
   500  
   501  	add,dc		ma128, s128, s128	C accum mid 2
   502  	fldd		0(up), %fr4
   503  	add,dc		ma192, s192, s192	C accum mid 3
   504  	fldd		8(up), %fr5
   505  
   506  	add,dc		ma256, climb, climb
   507  	fldd		16(up), %fr6
   508  	add		r000, s000, s000	C accum rlimb 0
   509  	fldd		24(up), %fr7
   510  
   511  	add,dc		r064, s064, s064	C accum rlimb 1
   512  	add,dc		r128, s128, s128	C accum rlimb 2
   513  	std		s000, 0(rp)
   514  
   515  	add,dc		r192, s192, s192	C accum rlimb 3
   516  	add,dc		%r0, climb, climb
   517  	std		s064, 8(rp)
   518  
        C Multiply the group just loaded while reloading the mid products that
        C were stored for the group before it.
   519  	xmpyu		%fr8R, %fr4L, %fr22
   520  	ldd		-0x78(%r30), p032a1
   521  	xmpyu		%fr8L, %fr4R, %fr23
   522  	std		s128, 16(rp)
   523  
   524  	xmpyu		%fr8R, %fr5L, %fr24
   525  	ldd		-0x70(%r30), p032a2
   526  	xmpyu		%fr8L, %fr5R, %fr25
   527  	std		s192, 24(rp)
   528  
   529  	xmpyu		%fr8R, %fr6L, %fr26
   530  	ldd		-0x38(%r30), p096b1
   531  	xmpyu		%fr8L, %fr6R, %fr27
   532  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   533  
   534  	xmpyu		%fr8R, %fr7L, %fr28
   535  	ldd		-0x30(%r30), p096b2
   536  	xmpyu		%fr8L, %fr7R, %fr29
   537  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   538  
   539  	xmpyu		%fr8R, %fr4R, %fr30
   540  	ldd		-0x58(%r30), p160c1
   541  	xmpyu		%fr8L, %fr4L, %fr31
   542  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   543  
   544  	xmpyu		%fr8R, %fr5R, %fr22
   545  	ldd		-0x50(%r30), p160c2
   546  	xmpyu		%fr8L, %fr5L, %fr23
   547  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   548  
   549  	xmpyu		%fr8R, %fr6R, %fr24
   550  	ldd		-0x18(%r30), p224d1
   551  	xmpyu		%fr8L, %fr6L, %fr25
   552  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   553  
   554  	xmpyu		%fr8R, %fr7R, %fr26
   555  	ldd		-0x10(%r30), p224d2
   556  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
   557  	xmpyu		%fr8L, %fr7L, %fr27
   558  
   559  	addib,<>	-1, n, L(loop)		C repeat while groups remain
   560  	ldo		32(rp), rp		C executed in the branch delay slot
   561  
   562  LDEF(end2)
        C Wind-down, first part: same accumulate and store sequence as the loop,
        C but no new limbs are loaded or multiplied.  The products still held in
        C fr22-fr31 are stored, and the mid products are reloaded for end1.
   563  	add		p032a1, p032a2, m032	C mid sums, carry chained through m288
   564  	ldd		-0x80(%r30), p000a
   565  	add,dc		p096b1, p096b2, m096
   566  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   567  	add,dc		p160c1, p160c2, m160
   568  	ldd		-0x68(%r30), p064a
   569  	add,dc		p224d1, p224d2, m224
   570  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   571  	add,dc		%r0, %r0, m288
   572  	ldd		-0x40(%r30), p064b
   573  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   574  	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits
   575  	ldd		-0x28(%r30), p128b
   576  	extrd,u		m032, 31, 32, ma064
   577  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   578  	depd		m096, 31, 32, ma064
   579  	ldd		-0x60(%r30), p128c
   580  	extrd,u		m096, 31, 32, ma128
   581  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   582  	depd		m160, 31, 32, ma128
   583  	ldd		-0x48(%r30), p192c
   584  	extrd,u		m160, 31, 32, ma192
   585  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   586  	depd		m224, 31, 32, ma192
   587  	ldd		-0x20(%r30), p192d
   588  	extrd,u		m224, 31, 32, ma256
   589  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   590  	depd		m288, 31, 32, ma256
   591  	ldd		-0x88(%r30), p256d
   592  	add		climb, p000a, s000	C carry in plus low and high products
   593  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   594  	add,dc		p064a, p064b, s064
   595  	ldd		0(rp), r000
   596  	add,dc		p128b, p128c, s128
   597  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   598  	add,dc		p192c, p192d, s192
   599  	ldd		8(rp), r064
   600  	add,dc		p256d, %r0, climb
   601  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   602  	ldd		16(rp), r128
   603  	add		ma000, s000, s000	C accum mid 0
   604  	ldd		24(rp), r192
   605  	add,dc		ma064, s064, s064	C accum mid 1
   606  	add,dc		ma128, s128, s128	C accum mid 2
   607  	add,dc		ma192, s192, s192	C accum mid 3
   608  	add,dc		ma256, climb, climb
   609  	add		r000, s000, s000	C accum rlimb 0
   610  	add,dc		r064, s064, s064	C accum rlimb 1
   611  	add,dc		r128, s128, s128	C accum rlimb 2
   612  	std		s000, 0(rp)
   613  	add,dc		r192, s192, s192	C accum rlimb 3
   614  	add,dc		%r0, climb, climb
   615  	std		s064, 8(rp)
   616  	ldd		-0x78(%r30), p032a1	C mid products of the last group
   617  	std		s128, 16(rp)
   618  	ldd		-0x70(%r30), p032a2
   619  	std		s192, 24(rp)
   620  	ldd		-0x38(%r30), p096b1
   621  	ldd		-0x30(%r30), p096b2
   622  	ldd		-0x58(%r30), p160c1
   623  	ldd		-0x50(%r30), p160c2
   624  	ldd		-0x18(%r30), p224d1
   625  	ldd		-0x10(%r30), p224d2
   626  	ldo		32(rp), rp
   627  
   628  LDEF(end1)
        C Wind-down, last part: all products of the final group are on the stack
        C and its mid products are already in registers.  Accumulate, store the
        C last four limbs, then restore r6-r13.
   629  	add		p032a1, p032a2, m032	C mid sums, carry chained through m288
   630  	ldd		-0x80(%r30), p000a
   631  	add,dc		p096b1, p096b2, m096
   632  	add,dc		p160c1, p160c2, m160
   633  	ldd		-0x68(%r30), p064a
   634  	add,dc		p224d1, p224d2, m224
   635  	add,dc		%r0, %r0, m288
   636  	ldd		-0x40(%r30), p064b
   637  	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits
   638  	ldd		-0x28(%r30), p128b
   639  	extrd,u		m032, 31, 32, ma064
   640  	depd		m096, 31, 32, ma064
   641  	ldd		-0x60(%r30), p128c
   642  	extrd,u		m096, 31, 32, ma128
   643  	depd		m160, 31, 32, ma128
   644  	ldd		-0x48(%r30), p192c
   645  	extrd,u		m160, 31, 32, ma192
   646  	depd		m224, 31, 32, ma192
   647  	ldd		-0x20(%r30), p192d
   648  	extrd,u		m224, 31, 32, ma256
   649  	depd		m288, 31, 32, ma256
   650  	ldd		-0x88(%r30), p256d
   651  	add		climb, p000a, s000	C carry in plus low and high products
   652  	add,dc		p064a, p064b, s064
   653  	ldd		0(rp), r000
   654  	add,dc		p128b, p128c, s128
   655  	add,dc		p192c, p192d, s192
   656  	ldd		8(rp), r064
   657  	add,dc		p256d, %r0, climb
   658  	ldd		16(rp), r128
   659  	add		ma000, s000, s000	C accum mid 0
   660  	ldd		24(rp), r192
   661  	add,dc		ma064, s064, s064	C accum mid 1
   662  	add,dc		ma128, s128, s128	C accum mid 2
   663  	add,dc		ma192, s192, s192	C accum mid 3
   664  	add,dc		ma256, climb, climb
   665  	add		r000, s000, s000	C accum rlimb 0
   666  	add,dc		r064, s064, s064	C accum rlimb 1
   667  	add,dc		r128, s128, s128	C accum rlimb 2
   668  	std		s000, 0(rp)
   669  	add,dc		r192, s192, s192	C accum rlimb 3
   670  	add,dc		%r0, climb, climb	C final carry limb
   671  	std		s064, 8(rp)
   672  	std		s128, 16(rp)
   673  	std		s192, 24(rp)
   674  
        C Restore the registers saved at BIG.
   675  	ldd		-0xb0(%r30), %r13
   676  	ldd		-0xb8(%r30), %r12
   677  	ldd		-0xc0(%r30), %r11
   678  	ldd		-0xc8(%r30), %r10
   679  	ldd		-0xd0(%r30), %r9
   680  	ldd		-0xd8(%r30), %r8
   681  	ldd		-0xe0(%r30), %r7
   682  	ldd		-0xe8(%r30), %r6
   683  LDEF(done)
        C Common exit: hand back the carry limb, restore r3-r5 and release the
        C 0x100 byte frame.  Reached directly from 0_one_out when r6-r13 were
        C never saved.
   684  ifdef(`HAVE_ABI_2_0w',
   685  `	copy		climb, %r28		C 2.0w: whole carry limb in r28
   686  ',`	extrd,u		climb, 63, 32, %r29	C 2.0: low 32 bits of climb in r29
   687  	extrd,u		climb, 31, 32, %r28	C 2.0: upper 32 bits of climb in r28
   688  ')
   689  	ldd		-0xf0(%r30), %r5
   690  	ldd		-0xf8(%r30), %r4
   691  	bve		(%r2)			C branch to the address held in r2
   692  	ldd,mb		-0x100(%r30), %r3	C delay slot: move r30 back by 0x100 and reload r3
   693  EPILOGUE(mpn_addmul_1)