github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/pa64/submul_1.asm (about)

     1  dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
     2  dnl  subtract the result from a second limb vector.
     3  
     4  dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C		    cycles/limb
    35  C 8000,8200:		7
    36  C 8500,8600,8700:	6.5
    37  
    38  C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
    39  C  could be saved there per call.
    40  
    41  C  DESCRIPTION:
    42  C  The main loop "BIG" is 4-way unrolled, mainly to allow
    43  C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
    44  C  registers to the IU registers, have demanded a deep software pipeline, and
    45  C  a lot of stack slots for partial products in flight.
    46  C
    47  C  CODE STRUCTURE:
    48  C  save-some-registers
    49  C  do 0, 1, 2, or 3 limbs
    50  C  if done, restore-some-regs and return
    51  C  save-many-regs
    52  C  do 4, 8, ... limbs
    53  C  restore-all-regs
    54  
    55  C  STACK LAYOUT:
    56  C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
    57  C  slots marked FREE, as well as some slots in the caller's "frame marker".
    58  C
    59  C -00 <- r30
    60  C -08  FREE
    61  C -10  tmp
    62  C -18  tmp
    63  C -20  tmp
    64  C -28  tmp
    65  C -30  tmp
    66  C -38  tmp
    67  C -40  tmp
    68  C -48  tmp
    69  C -50  tmp
    70  C -58  tmp
    71  C -60  tmp
    72  C -68  tmp
    73  C -70  tmp
    74  C -78  tmp
    75  C -80  tmp
    76  C -88  tmp
    77  C -90  FREE
    78  C -98  FREE
    79  C -a0  FREE
    80  C -a8  FREE
    81  C -b0  r13
    82  C -b8  r12
    83  C -c0  r11
    84  C -c8  r10
    85  C -d0  r9
    86  C -d8  r8
    87  C -e0  r7
    88  C -e8  r6
    89  C -f0  r5
    90  C -f8  r4
    91  C -100 r3
    92  C  Previous frame:
    93  C  [unused area]
    94  C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
    95  
    96  
    97  include(`../config.m4')
    98  
    99  C INPUT PARAMETERS:
   100  define(`rp',`%r26')	C
   101  define(`up',`%r25')	C
   102  define(`n',`%r24')	C
   103  define(`vlimb',`%r23')	C
   104  
   105  define(`climb',`%r23')	C
   106  
   107  ifdef(`HAVE_ABI_2_0w',
   108  `	.level	2.0w
   109  ',`	.level	2.0
   110  ')
   111  PROLOGUE(mpn_submul_1)
        C  Entry: save r3-r5 in a frame of 0x100 bytes, move vlimb into %fr8 and
        C  clear climb.  climb and vlimb are aliases of the same register, so
        C  after this point vlimb is only available in %fr8.
   112  
   113  ifdef(`HAVE_ABI_2_0w',
   114  `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
   115  ')
   116  	std,ma		%r3, 0x100(%r30)	C save r3, then r30 += 0x100
   117  	std		%r4, -0xf8(%r30)
   118  	std		%r5, -0xf0(%r30)
   119  	ldo		0(%r0), climb		C clear climb
        C  The home slot that was at -0x38 on entry is at -0x138 now that r30
        C  has been advanced by 0x100.
   120  	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
   121  
        C  Register aliases used by the feed-in code for the first n mod 4 limbs.
        C  Digits in a name give the bit offset of the value within the product.
   122  define(`p032a1',`%r1')	C mid product reloaded from -0x78
   123  define(`p032a2',`%r19')	C mid product reloaded from -0x70
   124  
   125  define(`m032',`%r20')	C p032a1 + p032a2
   126  define(`m096',`%r21')	C carry out of the m032 sum
   127  
   128  define(`p000a',`%r22')	C low product reloaded from -0x80
   129  define(`p064a',`%r29')	C high product reloaded from -0x68
   130  
   131  define(`s000',`%r31')	C result limb being accumulated
   132  
   133  define(`ma000',`%r4')	C low half of m032 moved up into the high half
   134  define(`ma064',`%r20')	C high half of m032 with m096 above it; same register as m032
   135  
   136  define(`r000',`%r3')	C limb loaded from rp
   137  
        C  Dispatch on n mod 4.  A zero remainder goes straight to the 4-way code.
   138  	extrd,u		n, 63, 2, %r5	C r5 = low two bits of n
   139  	cmpb,=		%r5, %r0, L(BIG)
   140  	nop
   141  
        C  First limb: form the four 32x32 partial products of vlimb and the limb
        C  at up, and spill them so they can be reloaded into integer registers.
   142  	fldd		0(up), %fr4
   143  	ldo		8(up), up
   144  	xmpyu		%fr8R, %fr4L, %fr22
   145  	xmpyu		%fr8L, %fr4R, %fr23
   146  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   147  	xmpyu		%fr8R, %fr4R, %fr24
   148  	xmpyu		%fr8L, %fr4L, %fr25
   149  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   150  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   151  	addib,<>	-1, %r5, L(two_or_more)	C r5 -= 1, branch if more feed-in limbs remain
   152  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   153  LDEF(one)
        C  n mod 4 == 1: reload the four partial products and finish the limb at
        C  0_one_out.  The last ldd sits in the delay slot of the branch.
   154  	ldd		-0x78(%r30), p032a1
   155  	ldd		-0x70(%r30), p032a2
   156  	ldd		-0x80(%r30), p000a
   157  	b		L(0_one_out)
   158  	ldd		-0x68(%r30), p064a
   159  
   160  LDEF(two_or_more)
        C  At least two feed-in limbs.  Multiply the second limb; each ldd picks
        C  up a partial product of the first limb just before the following fstd
        C  reuses the same stack slot for the second limb.
   161  	fldd		0(up), %fr4
   162  	ldo		8(up), up
   163  	xmpyu		%fr8R, %fr4L, %fr22
   164  	xmpyu		%fr8L, %fr4R, %fr23
   165  	ldd		-0x78(%r30), p032a1
   166  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   167  	xmpyu		%fr8R, %fr4R, %fr24
   168  	xmpyu		%fr8L, %fr4L, %fr25
   169  	ldd		-0x70(%r30), p032a2
   170  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   171  	ldd		-0x80(%r30), p000a
   172  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   173  	ldd		-0x68(%r30), p064a
   174  	addib,<>	-1, %r5, L(three_or_more)	C r5 -= 1, branch if a third limb remains
   175  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   176  LDEF(two)
        C  n mod 4 == 2: combine the mid products of the first limb, fetch the
        C  limb at rp and continue at 0_two_out.
   177  	add		p032a1, p032a2, m032	C m032 = sum of the two mid products
   178  	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
   179  	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 in the high half, rest zero
   180  	extrd,u		m032, 31, 32, ma064	C ma064 = high half of m032
   181  	ldd		0(rp), r000
   182  	b		L(0_two_out)
   183  	depd		m096, 31, 32, ma064	C delay slot: m096 goes into the high half of ma064
   184  
   185  LDEF(three_or_more)
        C  n mod 4 == 3: start loading the third limb and combine the mid
        C  products of the first limb.  up is not advanced here; 0_out does that.
        C  The branch to 0_out and the whole loop0 body below are commented out,
        C  so execution simply falls through loop0 into 0_out.
   186  	fldd		0(up), %fr4
   187  	add		p032a1, p032a2, m032	C m032 = sum of the two mid products
   188  	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
   189  	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 in the high half, rest zero
   190  	extrd,u		m032, 31, 32, ma064	C ma064 = high half of m032
   191  	ldd		0(rp), r000
   192  C	addib,=		-1, %r5, L(0_out)
   193  	depd		m096, 31, 32, ma064	C m096 goes into the high half of ma064
   194  LDEF(loop0)
   195  C	xmpyu		%fr8R, %fr4L, %fr22
   196  C	xmpyu		%fr8L, %fr4R, %fr23
   197  C	ldd		-0x78(%r30), p032a1
   198  C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   199  C
   200  C	xmpyu		%fr8R, %fr4R, %fr24
   201  C	xmpyu		%fr8L, %fr4L, %fr25
   202  C	ldd		-0x70(%r30), p032a2
   203  C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   204  C
   205  C	ldo		8(rp), rp
   206  C	add		climb, p000a, s000
   207  C	ldd		-0x80(%r30), p000a
   208  C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   209  C
   210  C	add,dc		p064a, %r0, climb
   211  C	ldo		8(up), up
   212  C	ldd		-0x68(%r30), p064a
   213  C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   214  C
   215  C	add		ma000, s000, s000
   216  C	add,dc		ma064, climb, climb
   217  C	fldd		0(up), %fr4
   218  C
   219  C	sub		r000, s000, s000
   220  C	sub,db		%r0, climb, climb
   221  C	sub		%r0, climb, climb
   222  C	std		s000, -8(rp)
   223  C
   224  C	add		p032a1, p032a2, m032
   225  C	add,dc		%r0, %r0, m096
   226  C
   227  C	depd,z		m032, 31, 32, ma000
   228  C	extrd,u		m032, 31, 32, ma064
   229  C	ldd		0(rp), r000
   230  C	addib,<>	-1, %r5, L(loop0)
   231  C	depd		m096, 31, 32, ma064
   232  LDEF(0_out)
        C  Reached only for n mod 4 == 3.  Multiply the third limb (already in
        C  %fr4), reload the partial products of the second limb ahead of the
        C  stores that reuse their slots, and finish and store the first limb.
   233  	ldo		8(up), up
   234  	xmpyu		%fr8R, %fr4L, %fr22
   235  	xmpyu		%fr8L, %fr4R, %fr23
   236  	ldd		-0x78(%r30), p032a1
   237  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   238  	xmpyu		%fr8R, %fr4R, %fr24
   239  	xmpyu		%fr8L, %fr4L, %fr25
   240  	ldd		-0x70(%r30), p032a2
   241  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   242  	ldo		8(rp), rp
   243  	add		climb, p000a, s000	C s000 = incoming climb + low product
   244  	ldd		-0x80(%r30), p000a
   245  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   246  	add,dc		p064a, %r0, climb	C climb = high product + carry
   247  	ldd		-0x68(%r30), p064a
   248  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   249  	add		ma000, s000, s000	C add the realigned mid sum, low part
   250  	add,dc		ma064, climb, climb	C and its high part, with carry
   251  	sub		r000, s000, s000	C s000 = rp limb - product limb
   252  	sub,db		%r0, climb, climb	C climb = -climb - borrow
   253  	sub		%r0, climb, climb	C negate back: climb now includes the borrow
   254  	std		s000, -8(rp)	C rp was already advanced, hence -8
        C  Combine the mid products of the second limb and fetch its rp limb.
   255  	add		p032a1, p032a2, m032
   256  	add,dc		%r0, %r0, m096
   257  	depd,z		m032, 31, 32, ma000
   258  	extrd,u		m032, 31, 32, ma064
   259  	ldd		0(rp), r000
   260  	depd		m096, 31, 32, ma064
   261  LDEF(0_two_out)
        C  Two feed-in limbs are still pending.  Reload the partial products of
        C  the last one from the stack while finishing and storing the limb whose
        C  ma000, ma064 and r000 were prepared before arriving here.
   262  	ldd		-0x78(%r30), p032a1
   263  	ldd		-0x70(%r30), p032a2
   264  	ldo		8(rp), rp
   265  	add		climb, p000a, s000	C s000 = incoming climb + low product
   266  	ldd		-0x80(%r30), p000a
   267  	add,dc		p064a, %r0, climb	C climb = high product + carry
   268  	ldd		-0x68(%r30), p064a
   269  	add		ma000, s000, s000	C add the realigned mid sum, low part
   270  	add,dc		ma064, climb, climb	C and its high part, with carry
   271  	sub		r000, s000, s000	C s000 = rp limb - product limb
   272  	sub,db		%r0, climb, climb	C climb = -climb - borrow
   273  	sub		%r0, climb, climb	C negate back: climb now includes the borrow
   274  	std		s000, -8(rp)	C rp was already advanced, hence -8
   275  LDEF(0_one_out)
        C  Last feed-in limb: combine its mid products, subtract the product limb
        C  from the limb at rp, store the result and leave the carry-out in climb.
   276  	add		p032a1, p032a2, m032	C m032 = sum of the two mid products
   277  	add,dc		%r0, %r0, m096	C m096 = carry out of that sum
   278  	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 in the high half, rest zero
   279  	extrd,u		m032, 31, 32, ma064	C ma064 = high half of m032
   280  	ldd		0(rp), r000
   281  	depd		m096, 31, 32, ma064	C m096 goes into the high half of ma064
   282  
   283  	add		climb, p000a, s000	C s000 = incoming climb + low product
   284  	add,dc		p064a, %r0, climb	C climb = high product + carry
   285  	add		ma000, s000, s000	C add the realigned mid sum, low part
   286  	add,dc		ma064, climb, climb	C and its high part, with carry
   287  	sub		r000, s000, s000	C s000 = rp limb - product limb
   288  	sub,db		%r0, climb, climb	C climb = -climb - borrow
   289  	sub		%r0, climb, climb	C negate back: climb now includes the borrow
   290  	std		s000, 0(rp)
   291  
        C  n still holds the full limb count.  When 4 >= n no group of four limbs
        C  is left, so return; otherwise fall through into the 4-way code.
   292  	cmpib,>=	4, n, L(done)
   293  	ldo		8(rp), rp	C delay slot: step rp past the limb just stored
   294  
   295  C 4-way unrolled code.
   296  
   297  LDEF(BIG)
        C  Set-up for the 4-way unrolled code: redefine the register aliases for
        C  four limbs a, b, c and d, save r6-r13, and turn n into a count of
        C  4-limb groups.  Digits in a name give the bit offset of the value
        C  within the 4-limb product.  Several aliases share one register.
   298  
        C  Mid products, two per limb.
   299  define(`p032a1',`%r1')	C
   300  define(`p032a2',`%r19')	C
   301  define(`p096b1',`%r20')	C
   302  define(`p096b2',`%r21')	C
   303  define(`p160c1',`%r22')	C
   304  define(`p160c2',`%r29')	C
   305  define(`p224d1',`%r31')	C
   306  define(`p224d2',`%r3')	C
   307  			C
        C  Sums of the mid product pairs, formed as one carry chain; m288 is the
        C  final carry.
   308  define(`m032',`%r4')	C
   309  define(`m096',`%r5')	C
   310  define(`m160',`%r6')	C
   311  define(`m224',`%r7')	C
   312  define(`m288',`%r8')	C
   313  			C
        C  Low and high products of each limb, reloaded from the stack.
   314  define(`p000a',`%r1')	C
   315  define(`p064a',`%r19')	C
   316  define(`p064b',`%r20')	C
   317  define(`p128b',`%r21')	C
   318  define(`p128c',`%r22')	C
   319  define(`p192c',`%r29')	C
   320  define(`p192d',`%r31')	C
   321  define(`p256d',`%r3')	C
   322  			C
        C  The four result limbs.
   323  define(`s000',`%r10')	C
   324  define(`s064',`%r11')	C
   325  define(`s128',`%r12')	C
   326  define(`s192',`%r13')	C
   327  			C
        C  Mid sums shifted by 32 bits so that they line up with limb boundaries.
   328  define(`ma000',`%r9')	C
   329  define(`ma064',`%r4')	C
   330  define(`ma128',`%r5')	C
   331  define(`ma192',`%r6')	C
   332  define(`ma256',`%r7')	C
   333  			C
        C  The four limbs loaded from rp.
   334  define(`r000',`%r1')	C
   335  define(`r064',`%r19')	C
   336  define(`r128',`%r20')	C
   337  define(`r192',`%r21')	C
   338  
        C  Save the additional registers used by the 4-way code.  They are
        C  reloaded from the same slots after end1.
   339  	std		%r6, -0xe8(%r30)
   340  	std		%r7, -0xe0(%r30)
   341  	std		%r8, -0xd8(%r30)
   342  	std		%r9, -0xd0(%r30)
   343  	std		%r10, -0xc8(%r30)
   344  	std		%r11, -0xc0(%r30)
   345  	std		%r12, -0xb8(%r30)
   346  	std		%r13, -0xb0(%r30)
   347  
        C  n = n / 4, the number of 4-limb groups left to process.
   348  ifdef(`HAVE_ABI_2_0w',
   349  `	extrd,u		n, 61, 62, n		C right shift 2
   350  ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
   351  ')
   352  
   353  LDEF(4_or_more)
        C  Feed-in for the first group of four limbs: load them and form all
        C  sixteen 32x32 partial products.  %fr22-%fr27 are used twice, first for
        C  mid products (spilled at once) and then for low and high products.
   354  	fldd		0(up), %fr4
   355  	fldd		8(up), %fr5
   356  	fldd		16(up), %fr6
   357  	fldd		24(up), %fr7
   358  	xmpyu		%fr8R, %fr4L, %fr22
   359  	xmpyu		%fr8L, %fr4R, %fr23
   360  	xmpyu		%fr8R, %fr5L, %fr24
   361  	xmpyu		%fr8L, %fr5R, %fr25
   362  	xmpyu		%fr8R, %fr6L, %fr26
   363  	xmpyu		%fr8L, %fr6R, %fr27
   364  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   365  	xmpyu		%fr8R, %fr7L, %fr28
   366  	xmpyu		%fr8L, %fr7R, %fr29
   367  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   368  	xmpyu		%fr8R, %fr4R, %fr30
   369  	xmpyu		%fr8L, %fr4L, %fr31
   370  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   371  	xmpyu		%fr8R, %fr5R, %fr22
   372  	xmpyu		%fr8L, %fr5L, %fr23
   373  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   374  	xmpyu		%fr8R, %fr6R, %fr24
   375  	xmpyu		%fr8L, %fr6L, %fr25
   376  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   377  	xmpyu		%fr8R, %fr7R, %fr26
   378  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
        C  n -= 1.  If further groups remain go to 8_or_more; the xmpyu in the
        C  delay slot is needed on both paths.
   379  	addib,<>	-1, n, L(8_or_more)
   380  	xmpyu		%fr8L, %fr7L, %fr27
        C  This was the only group: spill the remaining products, reload the
        C  eight mid products and let end1 finish the group.
   381  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   382  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   383  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   384  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   385  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   386  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   387  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   388  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   389  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   390  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   391  	ldd		-0x78(%r30), p032a1
   392  	ldd		-0x70(%r30), p032a2
   393  	ldd		-0x38(%r30), p096b1
   394  	ldd		-0x30(%r30), p096b2
   395  	ldd		-0x58(%r30), p160c1
   396  	ldd		-0x50(%r30), p160c2
   397  	ldd		-0x18(%r30), p224d1
   398  	ldd		-0x10(%r30), p224d2
   399  	b		L(end1)
   400  	nop
   401  
   402  LDEF(8_or_more)
        C  At least two groups.  Spill the rest of the first group, then load
        C  and multiply the second group.  Each ldd fetches a mid product of the
        C  first group before an fstd reuses that slot for the second group.
   403  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   404  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   405  	ldo		32(up), up
   406  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   407  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   408  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   409  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   410  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   411  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   412  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   413  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   414  	fldd		0(up), %fr4
   415  	fldd		8(up), %fr5
   416  	fldd		16(up), %fr6
   417  	fldd		24(up), %fr7
   418  	xmpyu		%fr8R, %fr4L, %fr22
   419  	ldd		-0x78(%r30), p032a1
   420  	xmpyu		%fr8L, %fr4R, %fr23
   421  	xmpyu		%fr8R, %fr5L, %fr24
   422  	ldd		-0x70(%r30), p032a2
   423  	xmpyu		%fr8L, %fr5R, %fr25
   424  	xmpyu		%fr8R, %fr6L, %fr26
   425  	ldd		-0x38(%r30), p096b1
   426  	xmpyu		%fr8L, %fr6R, %fr27
   427  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   428  	xmpyu		%fr8R, %fr7L, %fr28
   429  	ldd		-0x30(%r30), p096b2
   430  	xmpyu		%fr8L, %fr7R, %fr29
   431  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   432  	xmpyu		%fr8R, %fr4R, %fr30
   433  	ldd		-0x58(%r30), p160c1
   434  	xmpyu		%fr8L, %fr4L, %fr31
   435  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   436  	xmpyu		%fr8R, %fr5R, %fr22
   437  	ldd		-0x50(%r30), p160c2
   438  	xmpyu		%fr8L, %fr5L, %fr23
   439  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   440  	xmpyu		%fr8R, %fr6R, %fr24
   441  	ldd		-0x18(%r30), p224d1
   442  	xmpyu		%fr8L, %fr6L, %fr25
   443  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   444  	xmpyu		%fr8R, %fr7R, %fr26
   445  	ldd		-0x10(%r30), p224d2
   446  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
        C  n -= 1.  With no third group skip the loop and wind down at end2;
        C  otherwise fall into the loop.  The xmpyu is in the delay slot.
   447  	addib,=		-1, n, L(end2)
   448  	xmpyu		%fr8L, %fr7L, %fr27
   449  LDEF(loop)
        C  Main software-pipelined loop, one group of four limbs per pass.  Each
        C  pass finishes the group whose mid products are in integer registers
        C  and whose low and high products are on the stack, spills the products
        C  of the following group that are still in FP registers, and loads and
        C  multiplies the group after that.
        C
        C  Sum the mid product pairs as one carry chain, m032 up to m288.
   450  	add		p032a1, p032a2, m032
   451  	ldd		-0x80(%r30), p000a
   452  	add,dc		p096b1, p096b2, m096
   453  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   454  
   455  	add,dc		p160c1, p160c2, m160
   456  	ldd		-0x68(%r30), p064a
   457  	add,dc		p224d1, p224d2, m224
   458  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   459  
   460  	add,dc		%r0, %r0, m288
   461  	ldd		-0x40(%r30), p064b
   462  	ldo		32(up), up
   463  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   464  
        C  Shift the mid sums by 32 bits into ma000 up to ma256.
   465  	depd,z		m032, 31, 32, ma000
   466  	ldd		-0x28(%r30), p128b
   467  	extrd,u		m032, 31, 32, ma064
   468  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   469  
   470  	depd		m096, 31, 32, ma064
   471  	ldd		-0x60(%r30), p128c
   472  	extrd,u		m096, 31, 32, ma128
   473  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   474  
   475  	depd		m160, 31, 32, ma128
   476  	ldd		-0x48(%r30), p192c
   477  	extrd,u		m160, 31, 32, ma192
   478  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   479  
   480  	depd		m224, 31, 32, ma192
   481  	ldd		-0x20(%r30), p192d
   482  	extrd,u		m224, 31, 32, ma256
   483  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   484  
        C  Add the incoming climb and the low and high products into s000 up to
        C  s192; the carry chain ends in the new climb.
   485  	depd		m288, 31, 32, ma256
   486  	ldd		-0x88(%r30), p256d
   487  	add		climb, p000a, s000
   488  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   489  
   490  	add,dc		p064a, p064b, s064
   491  	ldd		0(rp), r000
   492  	add,dc		p128b, p128c, s128
   493  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   494  
   495  	add,dc		p192c, p192d, s192
   496  	ldd		8(rp), r064
   497  	add,dc		p256d, %r0, climb
   498  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   499  
   500  	ldd		16(rp), r128
   501  	add		ma000, s000, s000	C accum mid 0
   502  	ldd		24(rp), r192
   503  	add,dc		ma064, s064, s064	C accum mid 1
   504  
   505  	add,dc		ma128, s128, s128	C accum mid 2
   506  	fldd		0(up), %fr4
   507  	add,dc		ma192, s192, s192	C accum mid 3
   508  	fldd		8(up), %fr5
   509  
   510  	add,dc		ma256, climb, climb
   511  	fldd		16(up), %fr6
   512  	sub		r000, s000, s000	C accum rlimb 0
   513  	fldd		24(up), %fr7
   514  
   515  	sub,db		r064, s064, s064	C accum rlimb 1
   516  	sub,db		r128, s128, s128	C accum rlimb 2
   517  	std		s000, 0(rp)
   518  
   519  	sub,db		r192, s192, s192	C accum rlimb 3
   520  	sub,db		%r0, climb, climb	C climb = -climb - borrow
   521  	sub		%r0, climb, climb	C negate back: climb now includes the borrow
   522  	std		s064, 8(rp)
   523  
        C  Multiply the limbs just loaded.  Each ldd fetches a mid product of the
        C  group to be finished next before an fstd reuses its slot.
   524  	xmpyu		%fr8R, %fr4L, %fr22
   525  	ldd		-0x78(%r30), p032a1
   526  	xmpyu		%fr8L, %fr4R, %fr23
   527  	std		s128, 16(rp)
   528  
   529  	xmpyu		%fr8R, %fr5L, %fr24
   530  	ldd		-0x70(%r30), p032a2
   531  	xmpyu		%fr8L, %fr5R, %fr25
   532  	std		s192, 24(rp)
   533  
   534  	xmpyu		%fr8R, %fr6L, %fr26
   535  	ldd		-0x38(%r30), p096b1
   536  	xmpyu		%fr8L, %fr6R, %fr27
   537  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   538  
   539  	xmpyu		%fr8R, %fr7L, %fr28
   540  	ldd		-0x30(%r30), p096b2
   541  	xmpyu		%fr8L, %fr7R, %fr29
   542  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   543  
   544  	xmpyu		%fr8R, %fr4R, %fr30
   545  	ldd		-0x58(%r30), p160c1
   546  	xmpyu		%fr8L, %fr4L, %fr31
   547  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   548  
   549  	xmpyu		%fr8R, %fr5R, %fr22
   550  	ldd		-0x50(%r30), p160c2
   551  	xmpyu		%fr8L, %fr5L, %fr23
   552  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   553  
   554  	xmpyu		%fr8R, %fr6R, %fr24
   555  	ldd		-0x18(%r30), p224d1
   556  	xmpyu		%fr8L, %fr6L, %fr25
   557  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   558  
   559  	xmpyu		%fr8R, %fr7R, %fr26
   560  	ldd		-0x10(%r30), p224d2
   561  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
   562  	xmpyu		%fr8L, %fr7L, %fr27
   563  
        C  n -= 1 and repeat while groups remain; rp advances in the delay slot.
   564  	addib,<>	-1, n, L(loop)
   565  	ldo		32(rp), rp
   566  
   567  LDEF(end2)
        C  Wind-down, second to last group.  Same arithmetic as one loop pass but
        C  with no new limbs loaded or multiplied: finish and store one group,
        C  spill the products of the last group that are still in FP registers,
        C  reload its mid products and fall through to end1.
   568  	add		p032a1, p032a2, m032
   569  	ldd		-0x80(%r30), p000a
   570  	add,dc		p096b1, p096b2, m096
   571  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   572  	add,dc		p160c1, p160c2, m160
   573  	ldd		-0x68(%r30), p064a
   574  	add,dc		p224d1, p224d2, m224
   575  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   576  	add,dc		%r0, %r0, m288
   577  	ldd		-0x40(%r30), p064b
   578  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   579  	depd,z		m032, 31, 32, ma000
   580  	ldd		-0x28(%r30), p128b
   581  	extrd,u		m032, 31, 32, ma064
   582  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   583  	depd		m096, 31, 32, ma064
   584  	ldd		-0x60(%r30), p128c
   585  	extrd,u		m096, 31, 32, ma128
   586  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   587  	depd		m160, 31, 32, ma128
   588  	ldd		-0x48(%r30), p192c
   589  	extrd,u		m160, 31, 32, ma192
   590  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   591  	depd		m224, 31, 32, ma192
   592  	ldd		-0x20(%r30), p192d
   593  	extrd,u		m224, 31, 32, ma256
   594  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   595  	depd		m288, 31, 32, ma256
   596  	ldd		-0x88(%r30), p256d
   597  	add		climb, p000a, s000
   598  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   599  	add,dc		p064a, p064b, s064
   600  	ldd		0(rp), r000
   601  	add,dc		p128b, p128c, s128
   602  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   603  	add,dc		p192c, p192d, s192
   604  	ldd		8(rp), r064
   605  	add,dc		p256d, %r0, climb
   606  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   607  	ldd		16(rp), r128
   608  	add		ma000, s000, s000	C accum mid 0
   609  	ldd		24(rp), r192
   610  	add,dc		ma064, s064, s064	C accum mid 1
   611  	add,dc		ma128, s128, s128	C accum mid 2
   612  	add,dc		ma192, s192, s192	C accum mid 3
   613  	add,dc		ma256, climb, climb
   614  	sub		r000, s000, s000	C accum rlimb 0
   615  	sub,db		r064, s064, s064	C accum rlimb 1
   616  	sub,db		r128, s128, s128	C accum rlimb 2
   617  	std		s000, 0(rp)
   618  	sub,db		r192, s192, s192	C accum rlimb 3
   619  	sub,db		%r0, climb, climb	C climb = -climb - borrow
   620  	sub		%r0, climb, climb	C negate back: climb now includes the borrow
   621  	std		s064, 8(rp)
        C  Reload the mid products of the last group while storing the remaining
        C  result limbs, then step rp to the last group.
   622  	ldd		-0x78(%r30), p032a1
   623  	std		s128, 16(rp)
   624  	ldd		-0x70(%r30), p032a2
   625  	std		s192, 24(rp)
   626  	ldd		-0x38(%r30), p096b1
   627  	ldd		-0x30(%r30), p096b2
   628  	ldd		-0x58(%r30), p160c1
   629  	ldd		-0x50(%r30), p160c2
   630  	ldd		-0x18(%r30), p224d1
   631  	ldd		-0x10(%r30), p224d2
   632  	ldo		32(rp), rp
   633  
   634  LDEF(end1)
        C  Wind-down, last group.  All its partial products are already on the
        C  stack or in integer registers, so this is the loop arithmetic with no
        C  FP work: sum the mid products, realign them, add the low and high
        C  products, subtract from the four rp limbs and store.
   635  	add		p032a1, p032a2, m032
   636  	ldd		-0x80(%r30), p000a
   637  	add,dc		p096b1, p096b2, m096
   638  	add,dc		p160c1, p160c2, m160
   639  	ldd		-0x68(%r30), p064a
   640  	add,dc		p224d1, p224d2, m224
   641  	add,dc		%r0, %r0, m288
   642  	ldd		-0x40(%r30), p064b
   643  	depd,z		m032, 31, 32, ma000
   644  	ldd		-0x28(%r30), p128b
   645  	extrd,u		m032, 31, 32, ma064
   646  	depd		m096, 31, 32, ma064
   647  	ldd		-0x60(%r30), p128c
   648  	extrd,u		m096, 31, 32, ma128
   649  	depd		m160, 31, 32, ma128
   650  	ldd		-0x48(%r30), p192c
   651  	extrd,u		m160, 31, 32, ma192
   652  	depd		m224, 31, 32, ma192
   653  	ldd		-0x20(%r30), p192d
   654  	extrd,u		m224, 31, 32, ma256
   655  	depd		m288, 31, 32, ma256
   656  	ldd		-0x88(%r30), p256d
   657  	add		climb, p000a, s000
   658  	add,dc		p064a, p064b, s064
   659  	ldd		0(rp), r000
   660  	add,dc		p128b, p128c, s128
   661  	add,dc		p192c, p192d, s192
   662  	ldd		8(rp), r064
   663  	add,dc		p256d, %r0, climb
   664  	ldd		16(rp), r128
   665  	add		ma000, s000, s000	C accum mid 0
   666  	ldd		24(rp), r192
   667  	add,dc		ma064, s064, s064	C accum mid 1
   668  	add,dc		ma128, s128, s128	C accum mid 2
   669  	add,dc		ma192, s192, s192	C accum mid 3
   670  	add,dc		ma256, climb, climb
   671  	sub		r000, s000, s000	C accum rlimb 0
   672  	sub,db		r064, s064, s064	C accum rlimb 1
   673  	sub,db		r128, s128, s128	C accum rlimb 2
   674  	std		s000, 0(rp)
   675  	sub,db		r192, s192, s192	C accum rlimb 3
   676  	sub,db		%r0, climb, climb	C climb = -climb - borrow
   677  	sub		%r0, climb, climb	C negate back: climb now includes the borrow
   678  	std		s064, 8(rp)
   679  	std		s128, 16(rp)
   680  	std		s192, 24(rp)
   681  
        C  Restore the registers saved at BIG; note that -0xd0 holds r9.
   682  	ldd		-0xb0(%r30), %r13
   683  	ldd		-0xb8(%r30), %r12
   684  	ldd		-0xc0(%r30), %r11
   685  	ldd		-0xc8(%r30), %r10
   686  	ldd		-0xd0(%r30), %r9
   687  	ldd		-0xd8(%r30), %r8
   688  	ldd		-0xe0(%r30), %r7
   689  	ldd		-0xe8(%r30), %r6
   690  LDEF(done)
        C  Common exit.  Hand climb back in %r28 for the 2.0w ABI, otherwise
        C  split it into its high 32 bits in %r28 and its low 32 bits in %r29.
        C  NOTE(review): these are presumably the ABI return registers; confirm
        C  against the calling convention.
   691  ifdef(`HAVE_ABI_2_0w',
   692  `	copy		climb, %r28
   693  ',`	extrd,u		climb, 63, 32, %r29
   694  	extrd,u		climb, 31, 32, %r28
   695  ')
        C  Restore r5 and r4, return through r2, and in the delay slot move r30
        C  back by 0x100 and reload r3, undoing the std,ma at entry.
   696  	ldd		-0xf0(%r30), %r5
   697  	ldd		-0xf8(%r30), %r4
   698  	bve		(%r2)
   699  	ldd,mb		-0x100(%r30), %r3
   700  EPILOGUE(mpn_submul_1)