github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/aorsorrlshC_n.asm (about)

     1  dnl  PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
     2  
     3  dnl  Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  C                  cycles/limb
    32  C POWER3/PPC630          1.83   (1.5 c/l should be possible)
    33  C POWER4/PPC970          3      (2.0 c/l should be possible)
    34  C POWER5                 3
    35  C POWER6              3.5-47
    36  C POWER7                 3
    37  
    38  C STATUS
    39  C  * Try combining upx+up, and vpx+vp.
    40  C  * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is
    41  C    greater than the 2nd operand.  Yes, this addition is non-commutative wrt
    42  C    performance.
    43  
    44  C INPUT PARAMETERS
    45  define(`rp', `r3')	C result pointer: base register of every std/stdu below
    46  define(`up', `r4')	C u source pointer: limbs are loaded into u0 and used unshifted
    47  define(`vp', `r5')	C v source pointer: limbs are loaded into v0/v1 and shifted by LSH
    48  define(`n',  `r6')	C limb count: only used to set up ctr, then r6 is reused as rpx
    49  C  Operation selection: the defined DO_* symbol supplies the five macros used by the code.
    50  ifdef(`DO_add', `
    51    define(`ADDSUBC',	`addc	$1, $2, $3')
    52    define(`ADDSUBE',	`adde	$1, $2, $3')
    53    define(`INITCY',	`addic	$1, r1, 0')
    54    define(`RETVAL',	`addze	r3, $1')
    55    define(`func',	mpn_addlsh`'LSH`'_n)')	C add: u + (v << LSH); r3 = (top v limb >> RSH) + carry
    56  ifdef(`DO_sub', `
    57    define(`ADDSUBC',	`subfc	$1, $2, $3')
    58    define(`ADDSUBE',	`subfe	$1, $2, $3')
    59    define(`INITCY',	`addic	$1, r1, -1')
    60    define(`RETVAL',	`subfze	r3, $1
    61  			neg	r3, r3')
    62    define(`func',	mpn_sublsh`'LSH`'_n)')	C sub: u - (v << LSH); r3 = (top v limb >> RSH) + borrow
    63  ifdef(`DO_rsb', `
    64    define(`ADDSUBC',	`subfc	$1, $3, $2')
    65    define(`ADDSUBE',	`subfe	$1, $3, $2')
    66    define(`INITCY',	`addic	$1, r1, -1')
    67    define(`RETVAL',	`addme	r3, $1')
    68    define(`func',	mpn_rsblsh`'LSH`'_n)')	C rsb: (v << LSH) - u; r3 = (top v limb >> RSH) - borrow
    69  C  Working registers; the names rpx, upx and vpx are used only on the L(big) path.
    70  define(`rpx', `r6')	C second store pointer; shares r6 with n, set up after mtctr
    71  define(`upx', `r7')	C u offset used together with rpx in ldx
    72  define(`vpx', `r12')	C v offset used together with rpx in ldx
    73  
    74  define(`s0', `r0')  define(`s1', `r9')	C alternating shifted-v limbs, overwritten by the result limb
    75  define(`u0', `r8')	C most recently loaded u limb
    76  define(`v0', `r10') define(`v1', `r11')	C alternating raw v limbs
    77  C  func(rp, up, vp, n): combine the n limbs at up with the n limbs at vp shifted left by LSH
    78  C  bits (add, sub or rsb as selected above); store n limbs at rp; r3 = RETVAL(top v limb >> RSH).
    79  ASM_START()
    80  PROLOGUE(func)
    81  	cmpldi	cr0, n, 13	C unsigned compare: n > 13 takes the two-pointer loop at L(big)
    82  	bgt	L(big)
    83  
    84  	mtctr	n		C copy n in ctr
    85  	INITCY(	r0)		C initial cy: cleared for add, set (no borrow) for sub/rsb; r0 is scratch
    86  
    87  	ld	v0, 0(vp)	C load v limb
    88  	ld	u0, 0(up)	C load u limb
    89  	addi	up, up, -8	C bias up by one limb for the 16(up) and 8(up) offsets in the loop
    90  	addi	rp, rp, -8	C bias rp by one limb for the 8(rp) and 16(rp) offsets in the loop
    91  	sldi	s1, v0, LSH	C lowest limb of the shifted v operand
    92  	bdz	L(ex1)		C If done, skip loop
    93  
    94  	ALIGN(16)
    95  L(lo0):	ld	v1, 8(vp)	C load v limb
    96  	ADDSUBE(s1, s1, u0)	C add limbs with cy, set cy
    97  	ldu	u0, 16(up)	C load u limb and update up
    98  	srdi	s0, v0, RSH	C shift down previous v limb
    99  	std	s1, 8(rp)	C store result limb
   100  	rldimi	s0, v1, LSH, 0	C left shift v limb and merge with prev v limb
   101  	bdz	L(ex0)		C decrement ctr and exit if done
   102  	ldu	v0, 16(vp)	C load v limb and update vp
   103  	ADDSUBE(s0, s0, u0)	C add limbs with cy, set cy
   104  	ld	u0, 8(up)	C load u limb
   105  	srdi	s1, v1, RSH	C shift down previous v limb
   106  	stdu	s0, 16(rp)	C store result limb and update rp
   107  	rldimi	s1, v0, LSH, 0	C left shift v limb and merge with prev v limb
   108  	bdnz	L(lo0)		C decrement ctr and loop back
   109  
   110  L(ex1):	ADDSUBE(r7, s1, u0)	C exit for odd n: pending limb is in s1, last v limb in v0
   111  	std	r7, 8(rp)	C store last result limb
   112  	srdi	r0, v0, RSH	C last v limb >> RSH, the part not yet included in any stored limb
   113  	RETVAL(	r0)		C r3 = that value adjusted by the final cy
   114  	blr
   115  L(ex0):	ADDSUBE(r7, s0, u0)	C exit for even n: pending limb is in s0, last v limb in v1
   116  	std	r7, 16(rp)	C store last result limb
   117  	srdi	r0, v1, RSH	C last v limb >> RSH, the part not yet included in any stored limb
   118  	RETVAL(	r0)		C r3 = that value adjusted by the final cy
   119  	blr
   120  
   121  
   122  L(big):	rldicl.	r0, n, 0,63	C r0 = n & 1, set cr0
   123  	addi	r6, n, -1	C ctr = (n - 1) >> 1 ...
   124  	srdi	r6, r6, 1	C ... = number of times the loop bottom is reached
   125  	mtctr	r6		C copy count into ctr
   126  	beq	cr0, L(b0)	C even n enters the loop at L(top), odd n at L(mid)
   127  
   128  L(b1):	ld	v1, 0(vp)	C odd n: v limb 0
   129  	ld	u0, 0(up)	C u limb 0
   130  	sldi	s1, v1, LSH	C s1 = limb 0 of the shifted v operand
   131  	srdi	s0, v1, RSH	C s0 = low part of shifted limb 1, completed at L(mid)
   132  	ld	v0, 8(vp)	C v limb 1
   133  	ADDSUBC(s1, s1, u0)	C add limbs without cy, set cy
   134  	addi	rpx, rp, -16	C first stdu 16(rpx) at L(mid) writes result limb 0
   135  	addi	rp, rp, -8	C first stdu 16(rp) at L(top) writes result limb 1
   136  	sub	upx, up, rp	C turn the source pointers into offsets from the store pointers, so
   137  	sub	vpx, vp, rp	C   that ldx with rp or rpx as base follows the stdu updates
   138  	sub	up, up, rpx
   139  	sub	vp, vp, rpx
   140  	addi	up, up, 8	C rp + up addresses u limb 2 at the first L(top)
   141  	addi	upx, upx, 16	C rpx + upx addresses u limb 1 at the first L(mid)
   142  	addi	vp, vp, 16	C rp + vp addresses v limb 3 at the first L(top)
   143  	addi	vpx, vpx, 24	C rpx + vpx addresses v limb 2 at the first L(mid)
   144  	b	L(mid)
   145  
   146  L(b0):	ld	v0, 0(vp)	C even n: v limb 0
   147  	ld	u0, 0(up)	C u limb 0
   148  	sldi	s0, v0, LSH	C s0 = limb 0 of the shifted v operand
   149  	srdi	s1, v0, RSH	C s1 = low part of shifted limb 1, completed at L(top)
   150  	ld	v1, 8(vp)	C v limb 1
   151  	ADDSUBC(s0, s0, u0)	C add limbs without cy, set cy
   152  	addi	rpx, rp, -8	C first stdu 16(rpx) at L(mid) writes result limb 1
   153  	addi	rp, rp, -16	C first stdu 16(rp) at L(top) writes result limb 0
   154  	sub	upx, up, rpx	C turn the source pointers into offsets from the store pointers, so
   155  	sub	vpx, vp, rpx	C   that ldx with rp or rpx as base follows the stdu updates
   156  	sub	up, up, rp
   157  	sub	vp, vp, rp
   158  	addi	up, up, 8	C rp + up addresses u limb 1 at the first L(top)
   159  	addi	upx, upx, 16	C rpx + upx addresses u limb 2 at the first L(mid)
   160  	addi	vp, vp, 16	C rp + vp addresses v limb 2 at the first L(top)
   161  	addi	vpx, vpx, 24	C rpx + vpx addresses v limb 3 at the first L(mid)
   162  
   163  	ALIGN(32)
   164  L(top):	ldx	u0, rp, up	C load u limb for the s1 operation below
   165  	ldx	v0, rp, vp	C load v limb, merged into s0 at L(mid)
   166  	rldimi	s1, v1, LSH, 0	C left shift v limb and merge with prev v limb
   167  	stdu	s0, 16(rp)	C store result limb from previous half, advance rp two limbs
   168  	srdi	s0, v1, RSH	C shift down v limb to start the next shifted limb
   169  	ADDSUBE(s1, s1, u0)	C add limbs with cy, set cy
   170  L(mid):	ldx	u0, rpx, upx	C load u limb for the s0 operation below
   171  	ldx	v1, rpx, vpx	C load v limb, merged into s1 at L(top) or after the loop
   172  	rldimi	s0, v0, LSH, 0	C left shift v limb and merge with prev v limb
   173  	stdu	s1, 16(rpx)	C store result limb from previous half, advance rpx two limbs
   174  	srdi	s1, v0, RSH	C shift down v limb to start the next shifted limb
   175  	ADDSUBE(s0, s0, u0)	C add limbs with cy, set cy
   176  	bdnz	L(top)		C decrement CTR and loop back
   177  
   178  	ldx	u0, rp, up	C wind down: load last u limb
   179  	rldimi	s1, v1, LSH, 0	C complete last shifted limb with the last v limb
   180  	std	s0, 16(rp)	C store next-to-last result limb
   181  	srdi	s0, v1, RSH	C last v limb >> RSH, input to the return value
   182  	ADDSUBE(s1, s1, u0)	C add limbs with cy, set cy
   183  	std	s1, 24(rp)	C store last result limb
   184  
   185  	RETVAL(	s0)		C r3 = s0 adjusted by the final cy
   186  	blr
   187  EPILOGUE()