github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/pa64/mul_1.asm (about)

     1  dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
     2  dnl  the result in a second limb vector.
     3  
     4  dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C		    cycles/limb
    35  C 8000,8200:		6.5
    36  C 8500,8600,8700:	5.625
    37  
    38  C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
    39  C  could be saved there per call.
    40  
    41  C  DESCRIPTION:
    42  C  The main loop "BIG" is 4-way unrolled, mainly to allow
    43  C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
    44  C  registers to the IU registers, have demanded a deep software pipeline, and
    45  C  a lot of stack slots for partial products in flight.
    46  C
    47  C  CODE STRUCTURE:
    48  C  save-some-registers
    49  C  do 0, 1, 2, or 3 limbs
    50  C  if done, restore-some-regs and return
    51  C  save-many-regs
    52  C  do 4, 8, ... limbs
    53  C  restore-all-regs
    54  
    55  C  STACK LAYOUT:
    56  C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
    57  C  slots marked FREE, as well as some slots in the caller's "frame marker".
    58  C
    59  C -00 <- r30
    60  C -08  FREE
    61  C -10  tmp
    62  C -18  tmp
    63  C -20  tmp
    64  C -28  tmp
    65  C -30  tmp
    66  C -38  tmp
    67  C -40  tmp
    68  C -48  tmp
    69  C -50  tmp
    70  C -58  tmp
    71  C -60  tmp
    72  C -68  tmp
    73  C -70  tmp
    74  C -78  tmp
    75  C -80  tmp
    76  C -88  tmp
    77  C -90  FREE
    78  C -98  FREE
    79  C -a0  FREE
    80  C -a8  FREE
    81  C -b0  r13
    82  C -b8  r12
    83  C -c0  r11
    84  C -c8  r10
    85  C -d0  r9
    86  C -d8  r8
    87  C -e0  r7
    88  C -e8  r6
    89  C -f0  r5
    90  C -f8  r4
    91  C -100 r3
    92  C  Previous frame:
    93  C  [unused area]
    94  C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
    95  
    96  
    97  include(`../config.m4')
    98  
    99  C INPUT PARAMETERS:
   100  define(`rp',`%r26')	C pointer to the result limbs, written with std
   101  define(`up',`%r25')	C pointer to the source limbs, read with fldd
   102  define(`n',`%r24')	C limb count; low 2 bits pick the feed-in path, n/4 counts groups
   103  define(`vlimb',`%r23')	C multiplier limb; copied via its home slot into %fr8
   104  
   105  define(`climb',`%r23')	C carry limb; same register as vlimb, cleared once vlimb is in memory
   106  
   107  ifdef(`HAVE_ABI_2_0w',
   108  `	.level	2.0w
   109  ',`	.level	2.0
   110  ')
   111  PROLOGUE(mpn_mul_1)
   112  C  Multiply the n limbs at up by vlimb, store the n result limbs at rp, hand back the carry limb in %r28 (2.0N: %r28 high half, %r29 low half).
   113  ifdef(`HAVE_ABI_2_0w',
   114  `	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
   115  ')
   116  	std,ma		%r3, 0x100(%r30)	C save r3 at the old r30, then advance r30 by 0x100
   117  	std		%r4, -0xf8(%r30)	C r4 and r5 are used by the 1..3 limb code below
   118  	std		%r5, -0xf0(%r30)
   119  	ldo		0(%r0), climb		C clear climb
   120  	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
   121  C  Register roles for the 1..3 limb feed-in code.
   122  define(`p032a1',`%r1')	C mid product read back from -0x78
   123  define(`p032a2',`%r19')	C mid product read back from -0x70
   124  
   125  define(`m032',`%r20')	C sum of the two mid products
   126  define(`m096',`%r21')	C carry out of that sum
   127  
   128  define(`p000a',`%r22')	C low product read back from -0x80
   129  define(`p064a',`%r29')	C high product read back from -0x68
   130  
   131  define(`s000',`%r31')	C result limb being assembled
   132  
   133  define(`ma000',`%r4')	C m032 shifted left 32 bits
   134  define(`ma064',`%r20')	C m032 shifted right 32 bits with m096 deposited above it
   135  
   136  C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
   137  
   138  	extrd,u		n, 63, 2, %r5		C r5 = low two bits of n, i.e. n mod 4
   139  	cmpb,=		%r5, %r0, L(BIG)	C n mod 4 == 0: skip the feed-in, go to the 4-way code
   140  	nop					C delay slot
   141  C  First leading limb: form its four 32x32 partial products and park them on the stack.
   142  	fldd		0(up), %fr4
   143  	ldo		8(up), up
   144  	xmpyu		%fr8R, %fr4L, %fr22
   145  	xmpyu		%fr8L, %fr4R, %fr23
   146  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   147  	xmpyu		%fr8R, %fr4R, %fr24
   148  	xmpyu		%fr8L, %fr4L, %fr25
   149  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   150  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   151  	addib,<>	-1, %r5, L(two_or_more)	C decrement the leading-limb count; branch if any remain
   152  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   153  LDEF(one)
   154  	ldd		-0x78(%r30), p032a1
   155  	ldd		-0x70(%r30), p032a2
   156  	ldd		-0x80(%r30), p000a
   157  	b		L(0_one_out)
   158  	ldd		-0x68(%r30), p064a	C delay slot
   159  
   160  LDEF(two_or_more)
   161  	fldd		0(up), %fr4
   162  	ldo		8(up), up
   163  	xmpyu		%fr8R, %fr4L, %fr22
   164  	xmpyu		%fr8L, %fr4R, %fr23
   165  	ldd		-0x78(%r30), p032a1	C fetch the previous limb's product before the slot is overwritten
   166  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   167  	xmpyu		%fr8R, %fr4R, %fr24
   168  	xmpyu		%fr8L, %fr4L, %fr25
   169  	ldd		-0x70(%r30), p032a2
   170  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   171  	ldd		-0x80(%r30), p000a
   172  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   173  	ldd		-0x68(%r30), p064a
   174  	addib,<>	-1, %r5, L(three_or_more)	C decrement the leading-limb count; branch if any remain
   175  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   176  LDEF(two)
   177  	add		p032a1, p032a2, m032	C sum the two mid products
   178  	add,dc		%r0, %r0, m096		C m096 = carry out of that sum
   179  	depd,z		m032, 31, 32, ma000	C ma000 = m032 shifted left 32 bits
   180  	extrd,u		m032, 31, 32, ma064	C ma064 = m032 shifted right 32 bits
   181  	b		L(0_two_out)
   182  	depd		m096, 31, 32, ma064	C delay slot: deposit the carry above the shifted sum
   183  
   184  LDEF(three_or_more)
   185  	fldd		0(up), %fr4
   186  	add		p032a1, p032a2, m032
   187  	add,dc		%r0, %r0, m096
   188  	depd,z		m032, 31, 32, ma000
   189  	extrd,u		m032, 31, 32, ma064
   190  C	addib,=		-1, %r5, L(0_out)
   191  	depd		m096, 31, 32, ma064	C falls through to 0_out; the loop0 body below is commented out
   192  LDEF(loop0)
   193  C	xmpyu		%fr8R, %fr4L, %fr22
   194  C	xmpyu		%fr8L, %fr4R, %fr23
   195  C	ldd		-0x78(%r30), p032a1
   196  C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   197  C
   198  C	xmpyu		%fr8R, %fr4R, %fr24
   199  C	xmpyu		%fr8L, %fr4L, %fr25
   200  C	ldd		-0x70(%r30), p032a2
   201  C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   202  C
   203  C	ldo		8(rp), rp
   204  C	add		climb, p000a, s000
   205  C	ldd		-0x80(%r30), p000a
   206  C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   207  C
   208  C	add,dc		p064a, %r0, climb
   209  C	ldo		8(up), up
   210  C	ldd		-0x68(%r30), p064a
   211  C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   212  C
   213  C	add		ma000, s000, s000
   214  C	add,dc		ma064, climb, climb
   215  C	fldd		0(up), %fr4
   216  C
   217  C	std		s000, -8(rp)
   218  C
   219  C	add		p032a1, p032a2, m032
   220  C	add,dc		%r0, %r0, m096
   221  C
   222  C	depd,z		m032, 31, 32, ma000
   223  C	extrd,u		m032, 31, 32, ma064
   224  C	addib,<>	-1, %r5, L(loop0)
   225  C	depd		m096, 31, 32, ma064
   226  LDEF(0_out)
   227  	ldo		8(up), up
   228  	xmpyu		%fr8R, %fr4L, %fr22
   229  	xmpyu		%fr8L, %fr4R, %fr23
   230  	ldd		-0x78(%r30), p032a1
   231  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   232  	xmpyu		%fr8R, %fr4R, %fr24
   233  	xmpyu		%fr8L, %fr4L, %fr25
   234  	ldd		-0x70(%r30), p032a2
   235  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   236  	ldo		8(rp), rp
   237  	add		climb, p000a, s000	C start the result limb: carry in plus low product
   238  	ldd		-0x80(%r30), p000a
   239  	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
   240  	add,dc		p064a, %r0, climb	C next carry = high product plus carry
   241  	ldd		-0x68(%r30), p064a
   242  	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
   243  	add		ma000, s000, s000	C add the realigned mid sum
   244  	add,dc		ma064, climb, climb
   245  	std		s000, -8(rp)		C rp was already advanced, hence the -8 offset
   246  	add		p032a1, p032a2, m032
   247  	add,dc		%r0, %r0, m096
   248  	depd,z		m032, 31, 32, ma000
   249  	extrd,u		m032, 31, 32, ma064
   250  	depd		m096, 31, 32, ma064
   251  LDEF(0_two_out)
   252  	ldd		-0x78(%r30), p032a1
   253  	ldd		-0x70(%r30), p032a2
   254  	ldo		8(rp), rp
   255  	add		climb, p000a, s000
   256  	ldd		-0x80(%r30), p000a
   257  	add,dc		p064a, %r0, climb
   258  	ldd		-0x68(%r30), p064a
   259  	add		ma000, s000, s000
   260  	add,dc		ma064, climb, climb
   261  	std		s000, -8(rp)
   262  LDEF(0_one_out)
   263  	add		p032a1, p032a2, m032
   264  	add,dc		%r0, %r0, m096
   265  	depd,z		m032, 31, 32, ma000
   266  	extrd,u		m032, 31, 32, ma064
   267  	depd		m096, 31, 32, ma064
   268  C  Last leading limb: combine low, high and realigned mid products into s000 and climb.
   269  	add		climb, p000a, s000
   270  	add,dc		p064a, %r0, climb
   271  	add		ma000, s000, s000
   272  	add,dc		ma064, climb, climb
   273  	std		s000, 0(rp)
   274  
   275  	cmpib,>=	4, n, L(done)		C branch to done when 4 >= n; n mod 4 is nonzero on this path
   276  	ldo		8(rp), rp		C delay slot: step rp past the limb just stored
   277  
   278  C 4-way unrolled code.
   279  
   280  LDEF(BIG)
   281  C  Register roles are redefined for the 4-way code; a, b, c, d name the four limbs of a group.
   282  define(`p032a1',`%r1')	C mid product of limb a, from -0x78
   283  define(`p032a2',`%r19')	C mid product of limb a, from -0x70
   284  define(`p096b1',`%r20')	C mid product of limb b, from -0x38
   285  define(`p096b2',`%r21')	C mid product of limb b, from -0x30
   286  define(`p160c1',`%r22')	C mid product of limb c, from -0x58
   287  define(`p160c2',`%r29')	C mid product of limb c, from -0x50
   288  define(`p224d1',`%r31')	C mid product of limb d, from -0x18
   289  define(`p224d2',`%r3')	C mid product of limb d, from -0x10
   290  			C
   291  define(`m032',`%r4')	C mid sum of limb a
   292  define(`m096',`%r5')	C mid sum of limb b plus carry
   293  define(`m160',`%r6')	C mid sum of limb c plus carry
   294  define(`m224',`%r7')	C mid sum of limb d plus carry
   295  define(`m288',`%r8')	C carry out of the mid sum chain
   296  			C
   297  define(`p000a',`%r1')	C low product of limb a, from -0x80
   298  define(`p064a',`%r19')	C high product of limb a, from -0x68
   299  define(`p064b',`%r20')	C low product of limb b, from -0x40
   300  define(`p128b',`%r21')	C high product of limb b, from -0x28
   301  define(`p128c',`%r22')	C low product of limb c, from -0x60
   302  define(`p192c',`%r29')	C high product of limb c, from -0x48
   303  define(`p192d',`%r31')	C low product of limb d, from -0x20
   304  define(`p256d',`%r3')	C high product of limb d, from -0x88
   305  			C
   306  define(`s000',`%r10')	C result limb stored at 0(rp)
   307  define(`s064',`%r11')	C result limb stored at 8(rp)
   308  define(`s128',`%r12')	C result limb stored at 16(rp)
   309  define(`s192',`%r13')	C result limb stored at 24(rp)
   310  			C
   311  define(`ma000',`%r9')	C m032 shifted left 32 bits
   312  define(`ma064',`%r4')	C high half of m032 below, low half of m096 above
   313  define(`ma128',`%r5')	C high half of m096 below, low half of m160 above
   314  define(`ma192',`%r6')	C high half of m160 below, low half of m224 above
   315  define(`ma256',`%r7')	C high half of m224 below, low half of m288 above
   316  C  Save r6..r13; they are reloaded after end1, just before done.
   317  	std		%r6, -0xe8(%r30)
   318  	std		%r7, -0xe0(%r30)
   319  	std		%r8, -0xd8(%r30)
   320  	std		%r9, -0xd0(%r30)
   321  	std		%r10, -0xc8(%r30)
   322  	std		%r11, -0xc0(%r30)
   323  	std		%r12, -0xb8(%r30)
   324  	std		%r13, -0xb0(%r30)
   325  
   326  ifdef(`HAVE_ABI_2_0w',
   327  `	extrd,u		n, 61, 62, n		C right shift 2
   328  ',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
   329  ')
   330  
   331  LDEF(4_or_more)
   332  	fldd		0(up), %fr4
   333  	fldd		8(up), %fr5
   334  	fldd		16(up), %fr6
   335  	fldd		24(up), %fr7
   336  	xmpyu		%fr8R, %fr4L, %fr22
   337  	xmpyu		%fr8L, %fr4R, %fr23
   338  	xmpyu		%fr8R, %fr5L, %fr24
   339  	xmpyu		%fr8L, %fr5R, %fr25
   340  	xmpyu		%fr8R, %fr6L, %fr26
   341  	xmpyu		%fr8L, %fr6R, %fr27
   342  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   343  	xmpyu		%fr8R, %fr7L, %fr28
   344  	xmpyu		%fr8L, %fr7R, %fr29
   345  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   346  	xmpyu		%fr8R, %fr4R, %fr30
   347  	xmpyu		%fr8L, %fr4L, %fr31
   348  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   349  	xmpyu		%fr8R, %fr5R, %fr22
   350  	xmpyu		%fr8L, %fr5L, %fr23
   351  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   352  	xmpyu		%fr8R, %fr6R, %fr24
   353  	xmpyu		%fr8L, %fr6L, %fr25
   354  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   355  	xmpyu		%fr8R, %fr7R, %fr26
   356  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
   357  	addib,<>	-1, n, L(8_or_more)	C decrement the group count; branch if more groups remain
   358  	xmpyu		%fr8L, %fr7L, %fr27	C delay slot
   359  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   360  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   361  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   362  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   363  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   364  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   365  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   366  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   367  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   368  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   369  	ldd		-0x78(%r30), p032a1
   370  	ldd		-0x70(%r30), p032a2
   371  	ldd		-0x38(%r30), p096b1
   372  	ldd		-0x30(%r30), p096b2
   373  	ldd		-0x58(%r30), p160c1
   374  	ldd		-0x50(%r30), p160c2
   375  	ldd		-0x18(%r30), p224d1
   376  	ldd		-0x10(%r30), p224d2
   377  	b		L(end1)
   378  	nop					C delay slot
   379  
   380  LDEF(8_or_more)
   381  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   382  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   383  	ldo		32(up), up
   384  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   385  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   386  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   387  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   388  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   389  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   390  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   391  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   392  	fldd		0(up), %fr4
   393  	fldd		8(up), %fr5
   394  	fldd		16(up), %fr6
   395  	fldd		24(up), %fr7
   396  	xmpyu		%fr8R, %fr4L, %fr22
   397  	ldd		-0x78(%r30), p032a1
   398  	xmpyu		%fr8L, %fr4R, %fr23
   399  	xmpyu		%fr8R, %fr5L, %fr24
   400  	ldd		-0x70(%r30), p032a2
   401  	xmpyu		%fr8L, %fr5R, %fr25
   402  	xmpyu		%fr8R, %fr6L, %fr26
   403  	ldd		-0x38(%r30), p096b1
   404  	xmpyu		%fr8L, %fr6R, %fr27
   405  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   406  	xmpyu		%fr8R, %fr7L, %fr28
   407  	ldd		-0x30(%r30), p096b2
   408  	xmpyu		%fr8L, %fr7R, %fr29
   409  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   410  	xmpyu		%fr8R, %fr4R, %fr30
   411  	ldd		-0x58(%r30), p160c1
   412  	xmpyu		%fr8L, %fr4L, %fr31
   413  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   414  	xmpyu		%fr8R, %fr5R, %fr22
   415  	ldd		-0x50(%r30), p160c2
   416  	xmpyu		%fr8L, %fr5L, %fr23
   417  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   418  	xmpyu		%fr8R, %fr6R, %fr24
   419  	ldd		-0x18(%r30), p224d1
   420  	xmpyu		%fr8L, %fr6L, %fr25
   421  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   422  	xmpyu		%fr8R, %fr7R, %fr26
   423  	ldd		-0x10(%r30), p224d2
   424  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
   425  	addib,=		-1, n, L(end2)		C decrement the group count; skip the loop when it reaches zero
   426  	xmpyu		%fr8L, %fr7L, %fr27	C delay slot
   427  LDEF(loop)
   428  	add		p032a1, p032a2, m032	C sum the mid product pairs, carry chained through m288
   429  	ldd		-0x80(%r30), p000a
   430  	add,dc		p096b1, p096b2, m096
   431  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   432  
   433  	add,dc		p160c1, p160c2, m160
   434  	ldd		-0x68(%r30), p064a
   435  	add,dc		p224d1, p224d2, m224
   436  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   437  
   438  	add,dc		%r0, %r0, m288
   439  	ldd		-0x40(%r30), p064b
   440  	ldo		32(up), up
   441  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   442  
   443  	depd,z		m032, 31, 32, ma000	C realign the mid sums by 32 bits into ma000..ma256
   444  	ldd		-0x28(%r30), p128b
   445  	extrd,u		m032, 31, 32, ma064
   446  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   447  
   448  	depd		m096, 31, 32, ma064
   449  	ldd		-0x60(%r30), p128c
   450  	extrd,u		m096, 31, 32, ma128
   451  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   452  
   453  	depd		m160, 31, 32, ma128
   454  	ldd		-0x48(%r30), p192c
   455  	extrd,u		m160, 31, 32, ma192
   456  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   457  
   458  	depd		m224, 31, 32, ma192
   459  	ldd		-0x20(%r30), p192d
   460  	extrd,u		m224, 31, 32, ma256
   461  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   462  
   463  	depd		m288, 31, 32, ma256
   464  	ldd		-0x88(%r30), p256d
   465  	add		climb, p000a, s000	C add low and high products, carry in from climb
   466  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   467  
   468  	add,dc		p064a, p064b, s064
   469  	add,dc		p128b, p128c, s128
   470  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   471  
   472  	add,dc		p192c, p192d, s192
   473  	add,dc		p256d, %r0, climb
   474  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   475  
   476  	add		ma000, s000, s000	C accum mid 0
   477  	fldd		0(up), %fr4
   478  	add,dc		ma064, s064, s064	C accum mid 1
   479  	std		s000, 0(rp)
   480  
   481  	add,dc		ma128, s128, s128	C accum mid 2
   482  	fldd		8(up), %fr5
   483  	add,dc		ma192, s192, s192	C accum mid 3
   484  	std		s064, 8(rp)
   485  
   486  	add,dc		ma256, climb, climb	C carry limb handed to the next group
   487  	fldd		16(up), %fr6
   488  	std		s128, 16(rp)
   489  
   490  	xmpyu		%fr8R, %fr4L, %fr22
   491  	ldd		-0x78(%r30), p032a1
   492  	xmpyu		%fr8L, %fr4R, %fr23
   493  	fldd		24(up), %fr7
   494  
   495  	xmpyu		%fr8R, %fr5L, %fr24
   496  	ldd		-0x70(%r30), p032a2
   497  	xmpyu		%fr8L, %fr5R, %fr25
   498  	std		s192, 24(rp)
   499  
   500  	xmpyu		%fr8R, %fr6L, %fr26
   501  	ldd		-0x38(%r30), p096b1
   502  	xmpyu		%fr8L, %fr6R, %fr27
   503  	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
   504  
   505  	xmpyu		%fr8R, %fr7L, %fr28
   506  	ldd		-0x30(%r30), p096b2
   507  	xmpyu		%fr8L, %fr7R, %fr29
   508  	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
   509  
   510  	xmpyu		%fr8R, %fr4R, %fr30
   511  	ldd		-0x58(%r30), p160c1
   512  	xmpyu		%fr8L, %fr4L, %fr31
   513  	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
   514  
   515  	xmpyu		%fr8R, %fr5R, %fr22
   516  	ldd		-0x50(%r30), p160c2
   517  	xmpyu		%fr8L, %fr5L, %fr23
   518  	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
   519  
   520  	xmpyu		%fr8R, %fr6R, %fr24
   521  	ldd		-0x18(%r30), p224d1
   522  	xmpyu		%fr8L, %fr6L, %fr25
   523  	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
   524  
   525  	xmpyu		%fr8R, %fr7R, %fr26
   526  	ldd		-0x10(%r30), p224d2
   527  	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
   528  	xmpyu		%fr8L, %fr7L, %fr27
   529  
   530  	addib,<>	-1, n, L(loop)		C decrement the group count; loop while groups remain
   531  	ldo		32(rp), rp		C delay slot: advance rp past the 4 limbs just stored
   532  C  end2: retire the group whose products are on the stack and spill the final group; nothing more is read from up.
   533  LDEF(end2)
   534  	add		p032a1, p032a2, m032
   535  	ldd		-0x80(%r30), p000a
   536  	add,dc		p096b1, p096b2, m096
   537  	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
   538  	add,dc		p160c1, p160c2, m160
   539  	ldd		-0x68(%r30), p064a
   540  	add,dc		p224d1, p224d2, m224
   541  	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
   542  	add,dc		%r0, %r0, m288
   543  	ldd		-0x40(%r30), p064b
   544  	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
   545  	depd,z		m032, 31, 32, ma000
   546  	ldd		-0x28(%r30), p128b
   547  	extrd,u		m032, 31, 32, ma064
   548  	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
   549  	depd		m096, 31, 32, ma064
   550  	ldd		-0x60(%r30), p128c
   551  	extrd,u		m096, 31, 32, ma128
   552  	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
   553  	depd		m160, 31, 32, ma128
   554  	ldd		-0x48(%r30), p192c
   555  	extrd,u		m160, 31, 32, ma192
   556  	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
   557  	depd		m224, 31, 32, ma192
   558  	ldd		-0x20(%r30), p192d
   559  	extrd,u		m224, 31, 32, ma256
   560  	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
   561  	depd		m288, 31, 32, ma256
   562  	ldd		-0x88(%r30), p256d
   563  	add		climb, p000a, s000
   564  	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
   565  	add,dc		p064a, p064b, s064
   566  	add,dc		p128b, p128c, s128
   567  	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
   568  	add,dc		p192c, p192d, s192
   569  	add,dc		p256d, %r0, climb
   570  	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
   571  	add		ma000, s000, s000	C accum mid 0
   572  	add,dc		ma064, s064, s064	C accum mid 1
   573  	add,dc		ma128, s128, s128	C accum mid 2
   574  	add,dc		ma192, s192, s192	C accum mid 3
   575  	add,dc		ma256, climb, climb
   576  	std		s000, 0(rp)
   577  	std		s064, 8(rp)
   578  	ldd		-0x78(%r30), p032a1
   579  	std		s128, 16(rp)
   580  	ldd		-0x70(%r30), p032a2
   581  	std		s192, 24(rp)
   582  	ldd		-0x38(%r30), p096b1
   583  	ldd		-0x30(%r30), p096b2
   584  	ldd		-0x58(%r30), p160c1
   585  	ldd		-0x50(%r30), p160c2
   586  	ldd		-0x18(%r30), p224d1
   587  	ldd		-0x10(%r30), p224d2
   588  	ldo		32(rp), rp
   589  C  end1: retire the last group of four limbs from the stack slots, then reload r6..r13.
   590  LDEF(end1)
   591  	add		p032a1, p032a2, m032
   592  	ldd		-0x80(%r30), p000a
   593  	add,dc		p096b1, p096b2, m096
   594  	add,dc		p160c1, p160c2, m160
   595  	ldd		-0x68(%r30), p064a
   596  	add,dc		p224d1, p224d2, m224
   597  	add,dc		%r0, %r0, m288
   598  	ldd		-0x40(%r30), p064b
   599  	depd,z		m032, 31, 32, ma000
   600  	ldd		-0x28(%r30), p128b
   601  	extrd,u		m032, 31, 32, ma064
   602  	depd		m096, 31, 32, ma064
   603  	ldd		-0x60(%r30), p128c
   604  	extrd,u		m096, 31, 32, ma128
   605  	depd		m160, 31, 32, ma128
   606  	ldd		-0x48(%r30), p192c
   607  	extrd,u		m160, 31, 32, ma192
   608  	depd		m224, 31, 32, ma192
   609  	ldd		-0x20(%r30), p192d
   610  	extrd,u		m224, 31, 32, ma256
   611  	depd		m288, 31, 32, ma256
   612  	ldd		-0x88(%r30), p256d
   613  	add		climb, p000a, s000
   614  	add,dc		p064a, p064b, s064
   615  	add,dc		p128b, p128c, s128
   616  	add,dc		p192c, p192d, s192
   617  	add,dc		p256d, %r0, climb
   618  	add		ma000, s000, s000	C accum mid 0
   619  	add,dc		ma064, s064, s064	C accum mid 1
   620  	add,dc		ma128, s128, s128	C accum mid 2
   621  	add,dc		ma192, s192, s192	C accum mid 3
   622  	add,dc		ma256, climb, climb
   623  	std		s000, 0(rp)
   624  	std		s064, 8(rp)
   625  	std		s128, 16(rp)
   626  	std		s192, 24(rp)
   627  C  Reload r6..r13 from the slots written at BIG; the short path enters at done and skips this.
   628  	ldd		-0xb0(%r30), %r13
   629  	ldd		-0xb8(%r30), %r12
   630  	ldd		-0xc0(%r30), %r11
   631  	ldd		-0xc8(%r30), %r10
   632  	ldd		-0xd0(%r30), %r9
   633  	ldd		-0xd8(%r30), %r8
   634  	ldd		-0xe0(%r30), %r7
   635  	ldd		-0xe8(%r30), %r6
   636  LDEF(done)
   637  ifdef(`HAVE_ABI_2_0w',
   638  `	copy		climb, %r28
   639  ',`	extrd,u		climb, 63, 32, %r29
   640  	extrd,u		climb, 31, 32, %r28
   641  ')
   642  	ldd		-0xf0(%r30), %r5
   643  	ldd		-0xf8(%r30), %r4
   644  	bve		(%r2)			C branch to the address held in r2
   645  	ldd,mb		-0x100(%r30), %r3	C delay slot: step r30 back by 0x100, then reload r3
   646  EPILOGUE(mpn_mul_1)