github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/pa64/sqr_diagonal.asm (about)

     1  dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.
     2  
     3  dnl  Copyright 2001-2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
    33  dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
    34  dnl  for optimization.
    35  
    36  include(`../config.m4')
    37  
    38  C INPUT PARAMETERS
    39  define(`rp',`%r26')	C destination pointer; two result limbs are stored per source limb
    40  define(`up',`%r25')	C source pointer; post-incremented by 8 on every limb load
    41  define(`n',`%r24')	C limb count; counted down by 1 per limb, must be at least 1 on entry
    42  
    43  define(`p00',`%r28')	C low result limb being updated (reloaded from -16(rp))
    44  define(`p32',`%r29')	C cross product of the two 32-bit halves, reloaded from a stack scratch slot
    45  define(`p64',`%r31')	C high result limb being updated (reloaded from -8(rp))
    46  define(`t0',`%r19')	C part of the shifted cross product added into p00
    47  define(`t1',`%r20')	C part of the shifted cross product added, with carry, into p64
    48  
    49  ifdef(`HAVE_ABI_2_0w',
    50  `	.level	2.0w
    51  ',`	.level	2.0
    52  ')
    53  PROLOGUE(mpn_sqr_diagonal)	C for each of the n limbs at up, write its 128-bit square as two limbs at rp (low limb first)
    54  	ldo		128(%r30),%r30	C advance %r30 by 128; -128(%r30) and -120(%r30) are the two scratch slots
    55  
    56  	fldds,ma	8(up),%fr8	C fr8 = first limb, then up += 8
    57  	addib,=		-1,n,L(end1)	C n -= 1; a single limb is finished off at end1
    58  	nop				C branch delay slot
    59  	fldds,ma	8(up),%fr4	C fr4 = second limb
    60  	xmpyu		%fr8l,%fr8r,%fr10	C cross product of the two halves of the first limb
    61  	fstd		%fr10,-120(%r30)	C park it in a scratch slot so the integer unit can reload it
    62  	xmpyu		%fr8r,%fr8r,%fr9	C right half squared
    63  	fstd		%fr9,0(rp)	C becomes the low result limb
    64  	xmpyu		%fr8l,%fr8l,%fr11	C left half squared
    65  	fstd		%fr11,8(rp)	C becomes the high result limb
    66  	addib,=		-1,n,L(end2)	C n -= 1; exactly two limbs are finished off at end2
    67  	ldo		16(rp),rp	C delay slot, runs on both paths: rp += 16
    68  
    69  LDEF(loop)	C unrolled twice; fr8/fr4 and the scratch slots -120/-128 swap roles between the halves
    70  	fldds,ma	8(up),%fr8		C load next up limb
    71  	xmpyu		%fr4l,%fr4r,%fr6	C cross product for the limb held in fr4
    72  	fstd		%fr6,-128(%r30)	C saved for the next half; the slot at -120 still holds the previous one
    73  	xmpyu		%fr4r,%fr4r,%fr5	C multiply in fp regs
    74  	fstd		%fr5,0(rp)	C low limb of the current square, cross term not yet added
    75  	xmpyu		%fr4l,%fr4l,%fr7	C left half squared
    76  	fstd		%fr7,8(rp)	C high limb of the current square, cross term not yet added
    77  	ldd		-120(%r30),p32	C cross product belonging to the previous result pair
    78  	ldd		-16(rp),p00		C accumulate in int regs
    79  	ldd		-8(rp),p64	C previous high result limb
    80  	depd,z		p32,30,31,t0	C t0 = p32 << 33, the low 64 bits of 2*p32*2^32
    81  	add		t0,p00,p00	C sets the carry consumed by add,dc below
    82  	std		p00,-16(rp)	C previous low limb is now final
    83  	extrd,u		p32,32,33,t1	C t1 = p32 >> 31, the high bits of 2*p32*2^32
    84  	add,dc		t1,p64,p64	C add including the carry out of the low-limb add
    85  	std		p64,-8(rp)	C previous high limb is now final
    86  	addib,=		-1,n,L(exit)	C n -= 1; if the limb now in fr8 is the last one, finish at exit
    87  	ldo		16(rp),rp	C delay slot, runs on both paths: rp += 16
    88  
    89  	fldds,ma	8(up),%fr4	C second half: same steps with fr8 and fr4 exchanged
    90  	xmpyu		%fr8l,%fr8r,%fr10	C cross product for the limb held in fr8
    91  	fstd		%fr10,-120(%r30)	C saved for the next half; the slot at -128 still holds the previous one
    92  	xmpyu		%fr8r,%fr8r,%fr9	C right half squared
    93  	fstd		%fr9,0(rp)	C low limb of the current square
    94  	xmpyu		%fr8l,%fr8l,%fr11	C left half squared
    95  	fstd		%fr11,8(rp)	C high limb of the current square
    96  	ldd		-128(%r30),p32	C cross product belonging to the previous result pair
    97  	ldd		-16(rp),p00	C previous low result limb
    98  	ldd		-8(rp),p64	C previous high result limb
    99  	depd,z		p32,30,31,t0	C t0 = p32 << 33
   100  	add		t0,p00,p00	C sets the carry consumed by add,dc below
   101  	std		p00,-16(rp)	C previous low limb is now final
   102  	extrd,u		p32,32,33,t1	C t1 = p32 >> 31
   103  	add,dc		t1,p64,p64	C add including the carry out of the low-limb add
   104  	std		p64,-8(rp)	C previous high limb is now final
   105  	addib,<>	-1,n,L(loop)	C n -= 1; keep looping while limbs remain, else fall into end2
   106  	ldo		16(rp),rp	C delay slot, runs on both paths: rp += 16
   107  
   108  LDEF(end2)	C wind-down when the last limb is in fr4 and the cross product at -120 is still pending
   109  	xmpyu		%fr4l,%fr4r,%fr6	C cross product of the last limb
   110  	fstd		%fr6,-128(%r30)	C reloaded below once the pending one has been folded in
   111  	xmpyu		%fr4r,%fr4r,%fr5	C right half squared
   112  	fstd		%fr5,0(rp)	C low limb of the last square
   113  	xmpyu		%fr4l,%fr4l,%fr7	C left half squared
   114  	fstd		%fr7,8(rp)	C high limb of the last square
   115  	ldd		-120(%r30),p32	C pending cross product for the previous result pair
   116  	ldd		-16(rp),p00	C previous low result limb
   117  	ldd		-8(rp),p64	C previous high result limb
   118  	depd,z		p32,30,31,t0	C t0 = p32 << 33
   119  	add		t0,p00,p00	C sets the carry consumed by add,dc below
   120  	std		p00,-16(rp)	C previous low limb is now final
   121  	extrd,u		p32,32,33,t1	C t1 = p32 >> 31
   122  	add,dc		t1,p64,p64	C add including the carry out of the low-limb add
   123  	std		p64,-8(rp)	C previous high limb is now final
   124  	ldo		16(rp),rp	C step past the last pair so the -16/-8 offsets address it
   125  	ldd		-128(%r30),p32	C cross product of the last limb
   126  	ldd		-16(rp),p00	C last low result limb
   127  	ldd		-8(rp),p64	C last high result limb
   128  	depd,z		p32,30,31,t0	C t0 = p32 << 33
   129  	add		t0,p00,p00	C sets the carry consumed by add,dc below
   130  	std		p00,-16(rp)	C last low limb is now final
   131  	extrd,u		p32,32,33,t1	C t1 = p32 >> 31
   132  	add,dc		t1,p64,p64	C add including the carry out of the low-limb add
   133  	std		p64,-8(rp)	C last high limb is now final
   134  	bve		(%r2)	C return through %r2
   135  	ldo		-128(%r30),%r30	C delay slot: undo the 128-byte %r30 adjustment
   136  
   137  LDEF(exit)	C wind-down when the last limb is in fr8 and the cross product at -128 is still pending
   138  	xmpyu		%fr8l,%fr8r,%fr10	C cross product of the last limb
   139  	fstd		%fr10,-120(%r30)	C reloaded below once the pending one has been folded in
   140  	xmpyu		%fr8r,%fr8r,%fr9	C right half squared
   141  	fstd		%fr9,0(rp)	C low limb of the last square
   142  	xmpyu		%fr8l,%fr8l,%fr11	C left half squared
   143  	fstd		%fr11,8(rp)	C high limb of the last square
   144  	ldd		-128(%r30),p32	C pending cross product for the previous result pair
   145  	ldd		-16(rp),p00	C previous low result limb
   146  	ldd		-8(rp),p64	C previous high result limb
   147  	depd,z		p32,31,32,t0	C t0 = p32 << 32; this path adds p32*2^32 twice instead of shifting by 33
   148  	add		t0,p00,p00	C first addition, low limb
   149  	extrd,u		p32,31,32,t1	C t1 = p32 >> 32
   150  	add,dc		t1,p64,p64	C first addition, high limb plus carry
   151  	add		t0,p00,p00	C second addition, low limb
   152  	add,dc		t1,p64,p64	C second addition, high limb plus carry
   153  	std		p00,-16(rp)	C previous low limb is now final
   154  	std		p64,-8(rp)	C previous high limb is now final
   155  	ldo		16(rp),rp	C step past the last pair so the -16/-8 offsets address it
   156  	ldd		-120(%r30),p32	C cross product of the last limb
   157  	ldd		-16(rp),p00	C last low result limb
   158  	ldd		-8(rp),p64	C last high result limb
   159  	depd,z		p32,31,32,t0	C t0 = p32 << 32
   160  	add		t0,p00,p00	C first addition, low limb
   161  	extrd,u		p32,31,32,t1	C t1 = p32 >> 32
   162  	add,dc		t1,p64,p64	C first addition, high limb plus carry
   163  	add		t0,p00,p00	C second addition, low limb
   164  	add,dc		t1,p64,p64	C second addition, high limb plus carry
   165  	std		p00,-16(rp)	C last low limb is now final
   166  	std		p64,-8(rp)	C last high limb is now final
   167  	bve		(%r2)	C return through %r2
   168  	ldo		-128(%r30),%r30	C delay slot: undo the 128-byte %r30 adjustment
   169  
   170  LDEF(end1)	C n was 1: the only limb is in fr8 and rp has not been advanced yet
   171  	xmpyu		%fr8l,%fr8r,%fr10	C cross product of the two halves
   172  	fstd		%fr10,-128(%r30)	C through the scratch slot into the integer unit
   173  	xmpyu		%fr8r,%fr8r,%fr9	C right half squared
   174  	fstd		%fr9,0(rp)	C low result limb
   175  	xmpyu		%fr8l,%fr8l,%fr11	C left half squared
   176  	fstd		%fr11,8(rp)	C high result limb
   177  	ldo		16(rp),rp	C step past the pair so the -16/-8 offsets address it
   178  	ldd		-128(%r30),p32	C cross product
   179  	ldd		-16(rp),p00	C low result limb
   180  	ldd		-8(rp),p64	C high result limb
   181  	depd,z		p32,31,32,t0	C t0 = p32 << 32
   182  	add		t0,p00,p00	C first addition, low limb
   183  	extrd,u		p32,31,32,t1	C t1 = p32 >> 32
   184  	add,dc		t1,p64,p64	C first addition, high limb plus carry
   185  	add		t0,p00,p00	C second addition, low limb
   186  	add,dc		t1,p64,p64	C second addition, high limb plus carry
   187  	std		p00,-16(rp)	C low limb is now final
   188  	std		p64,-8(rp)	C high limb is now final
   189  	bve		(%r2)	C return through %r2
   190  	ldo		-128(%r30),%r30	C delay slot: undo the 128-byte %r30 adjustment
   191  EPILOGUE(mpn_sqr_diagonal)